diff --git a/.travis.yml b/.travis.yml index 360143c5c7b44..a9d23b6ea7f25 100644 --- a/.travis.yml +++ b/.travis.yml @@ -43,6 +43,9 @@ env: - TRAVIS_FLAVOR=cassandra FLAVOR_VERSION=2.0.17 - TRAVIS_FLAVOR=cassandra FLAVOR_VERSION=2.1.14 - TRAVIS_FLAVOR=cassandra FLAVOR_VERSION=2.2.10 + - TRAVIS_FLAVOR=cassandra_nodetool FLAVOR_VERSION=2.0.17 + - TRAVIS_FLAVOR=cassandra_nodetool FLAVOR_VERSION=2.1.14 + - TRAVIS_FLAVOR=cassandra_nodetool FLAVOR_VERSION=2.2.10 - TRAVIS_FLAVOR=couch FLAVOR_VERSION=1.6.1 - TRAVIS_FLAVOR=consul FLAVOR_VERSION=v0.6.4 - TRAVIS_FLAVOR=consul FLAVOR_VERSION=0.7.2 diff --git a/cassandra_nodetool/CHANGELOG.md b/cassandra_nodetool/CHANGELOG.md new file mode 100644 index 0000000000000..8b810cb97a04d --- /dev/null +++ b/cassandra_nodetool/CHANGELOG.md @@ -0,0 +1,8 @@ +# CHANGELOG - Cassandra Nodetool Check + +0.1.0/ Unreleased +================== + +### Changes + +* [FEATURE] adds cassandra_nodetool integration. diff --git a/cassandra_nodetool/README.md b/cassandra_nodetool/README.md new file mode 100644 index 0000000000000..c5706395e26b1 --- /dev/null +++ b/cassandra_nodetool/README.md @@ -0,0 +1,62 @@ +# Agent Check: Cassandra Nodetool + +# Overview + +This check collects metrics for your Cassandra cluster that are not available through [jmx integration](https://github.com/DataDog/integrations-core/tree/master/cassandra). +It uses the `nodetool` utility to collect them. + +# Installation + +The cassandra_nodetool check is packaged with the Agent, so simply [install the Agent](https://app.datadoghq.com/account/settings#agent) on your cassandra nodes. +If you need the newest version of the check, install the `dd-check-cassandra_nodetool` package. + +# Configuration + +Create a file `cassandra_nodetool.yaml` in the Agent's `conf.d` directory: +``` +init_config: + # command or path to nodetool (e.g.
/usr/bin/nodetool or docker exec container nodetool) + # can be overwritten on an instance + # nodetool: /usr/bin/nodetool + +instances: + + # the list of keyspaces to monitor + - keyspaces: [] + + # host that nodetool will connect to. + # host: localhost + + # the port JMX is listening to for connections. + # port: 7199 + + # a set of credentials to connect to the host. These are the credentials for the JMX server. + # For the check to work, this user must have a read/write access so that nodetool can execute the `status` command + # username: + # password: + + # a list of additional tags to be sent with the metrics + # tags: [] +``` + +# Validation + +When you run `datadog-agent info` you should see something like the following: + + Checks + ====== + + cassandra_nodetool + ----------- + - instance #0 [OK] + - Collected 39 metrics, 0 events & 7 service checks + +# Compatibility + +The `cassandra_nodetool` check is compatible with all major platforms + +# Service Checks + +**cassandra.nodetool.node_up**: + +The agent sends this service check for each node of the monitored cluster. Returns CRITICAL if the node is down, otherwise OK. diff --git a/cassandra_nodetool/check.py b/cassandra_nodetool/check.py new file mode 100644 index 0000000000000..13c8e113709c4 --- /dev/null +++ b/cassandra_nodetool/check.py @@ -0,0 +1,136 @@ +# (C) Datadog, Inc. 2010-2016 +# All rights reserved +# Licensed under Simplified BSD License (see LICENSE) + +# stdlib +import re +import shlex + +# project +from checks import AgentCheck +from utils.subprocess_output import get_subprocess_output +from collections import defaultdict + +EVENT_TYPE = SOURCE_TYPE_NAME = 'cassandra_nodetool' +DEFAULT_HOST = 'localhost' +DEFAULT_PORT = '7199' +TO_BYTES = { + 'B': 1, + 'KB': 1e3, + 'MB': 1e6, + 'GB': 1e9, + 'TB': 1e12, +} + +class CassandraNodetoolCheck(AgentCheck): + + datacenter_name_re = re.compile('^Datacenter: (.*)') + node_status_re = re.compile('^(?P<status>[UD])[NLJM] +(?P<address>\d+\.\d+\.\d+\.\d+) +'
+ '(?P<load>\d+\.\d*) (?P<load_unit>(K|M|G|T)?B) +\d+ +' + '(?P<owns>(\d+\.\d+)|\?)%? +(?P<id>[a-fA-F0-9-]*) +(?P<rack>.*)') + + def __init__(self, name, init_config, agentConfig, instances=None): + AgentCheck.__init__(self, name, init_config, agentConfig, instances) + self.nodetool_cmd = init_config.get("nodetool", "/usr/bin/nodetool") + + def check(self, instance): + # Allow to specify a complete command for nodetool such as `docker exec container nodetool` + nodetool_cmd = shlex.split(instance.get("nodetool", self.nodetool_cmd)) + host = instance.get("host", DEFAULT_HOST) + port = instance.get("port", DEFAULT_PORT) + keyspaces = instance.get("keyspaces", []) + username = instance.get("username", "") + password = instance.get("password", "") + tags = instance.get("tags", []) + + # Flag to send service checks only once and not for every keyspace + send_service_checks = True + + for keyspace in keyspaces: + # Build the nodetool command + cmd = nodetool_cmd + ['-h', host, '-p', port] + if username and password: + cmd += ['-u', username, '-pw', password] + cmd += ['status', '--', keyspace] + + # Execute the command + out, err, _ = get_subprocess_output(cmd, self.log, False) + if err or 'Error:' in out: + self.log.error('Error executing nodetool status: %s', err or out) + continue + nodes = self._process_nodetool_output(out) + + percent_up_by_dc = defaultdict(float) + percent_total_by_dc = defaultdict(float) + # Send the stats per node and compute the stats per datacenter + for node in nodes: + + node_tags = ['node_address:%s' % node['address'], + 'node_id:%s' % node['id'], + 'datacenter:%s' % node['datacenter'], + 'rack:%s' % node['rack']] + + # nodetool prints `?` when it can't compute the value of `owns` for certain keyspaces (e.g.
system) + # don't send metric in this case + if node['owns'] != '?': + owns = float(node['owns']) + if node['status'] == 'U': + percent_up_by_dc[node['datacenter']] += owns + percent_total_by_dc[node['datacenter']] += owns + self.gauge('cassandra.nodetool.status.owns', owns, + tags=tags + node_tags + ['keyspace:%s' % keyspace]) + + # Send service check only once for each node + if send_service_checks: + status = AgentCheck.OK if node['status'] == 'U' else AgentCheck.CRITICAL + self.service_check('cassandra.nodetool.node_up', status, tags + node_tags) + + self.gauge('cassandra.nodetool.status.status', 1 if node['status'] == 'U' else 0, + tags=tags + node_tags) + self.gauge('cassandra.nodetool.status.load', float(node['load']) * TO_BYTES[node['load_unit']], + tags=tags + node_tags) + + # All service checks have been sent, don't resend + send_service_checks = False + + # Send the stats per datacenter + for datacenter, percent_up in percent_up_by_dc.items(): + self.gauge('cassandra.nodetool.status.replication_availability', percent_up, + tags=tags + ['keyspace:%s' % keyspace, 'datacenter:%s' % datacenter]) + for datacenter, percent_total in percent_total_by_dc.items(): + self.gauge('cassandra.nodetool.status.replication_factor', int(round(percent_total / 100)), + tags=tags + ['keyspace:%s' % keyspace, 'datacenter:%s' % datacenter]) + + def _process_nodetool_output(self, output): + nodes = [] + datacenter_name = "" + for line in output.splitlines(): + # Ouput of nodetool + # Datacenter: dc1 + # =============== + # Status=Up/Down + # |/ State=Normal/Leaving/Joining/Moving + # -- Address Load Tokens Owns (effective) Host ID Rack + # UN 172.21.0.3 184.8 KB 256 38.4% 7501ef03-eb63-4db0-95e6-20bfeb7cdd87 RAC1 + # UN 172.21.0.4 223.34 KB 256 39.5% e521a2a4-39d3-4311-a195-667bf56450f4 RAC1 + + match = self.datacenter_name_re.search(line) + if match: + datacenter_name = match.group(1) + continue + + match = self.node_status_re.search(line) + if match: + node = { + 'status': 
match.group('status'), + 'address': match.group('address'), + 'load': match.group('load'), + 'load_unit': match.group('load_unit'), + 'owns': match.group('owns'), + 'id': match.group('id'), + 'rack': match.group('rack'), + 'datacenter': datacenter_name + } + nodes.append(node) + + return nodes diff --git a/cassandra_nodetool/ci/cassandra_nodetool.rake b/cassandra_nodetool/ci/cassandra_nodetool.rake new file mode 100644 index 0000000000000..39f072691f042 --- /dev/null +++ b/cassandra_nodetool/ci/cassandra_nodetool.rake @@ -0,0 +1,94 @@ +require 'ci/common' + +def cassandra_nodetool_version + ENV['FLAVOR_VERSION'] || '2.1.14' # '2.0.17' +end + +container_name = 'dd-test-cassandra' +container_name2 = 'dd-test-cassandra2' + +container_port = 7199 +cassandra_jmx_options = "-Dcom.sun.management.jmxremote.port=#{container_port} + -Dcom.sun.management.jmxremote.rmi.port=#{container_port} + -Dcom.sun.management.jmxremote.ssl=false + -Dcom.sun.management.jmxremote.authenticate=true + -Dcom.sun.management.jmxremote.password.file=/etc/cassandra/jmxremote.password + -Djava.rmi.server.hostname=localhost" + +namespace :ci do + namespace :cassandra_nodetool do |flavor| + task before_install: ['ci:common:before_install'] do + sh %(docker kill #{container_name} 2>/dev/null || true) + sh %(docker rm #{container_name} 2>/dev/null || true) + sh %(docker kill #{container_name2} 2>/dev/null || true) + sh %(docker rm #{container_name2} 2>/dev/null || true) + sh %(rm -f #{__dir__}/jmxremote.password.tmp) + end + + task :install do + Rake::Task['ci:common:install'].invoke('cassandra_nodetool') + sh %(docker create --expose #{container_port} \ + -p #{container_port}:#{container_port} -e JMX_PORT=#{container_port} \ + -e LOCAL_JMX=no -e JVM_EXTRA_OPTS="#{cassandra_jmx_options}" --name #{container_name} cassandra:#{cassandra_nodetool_version}) + sh %(cp #{__dir__}/jmxremote.password #{__dir__}/jmxremote.password.tmp) + sh %(chmod 400 #{__dir__}/jmxremote.password.tmp) + sh %(docker cp 
#{__dir__}/jmxremote.password.tmp #{container_name}:/etc/cassandra/jmxremote.password) + sh %(rm -f #{__dir__}/jmxremote.password.tmp) + sh %(docker start #{container_name}) + + sh %(docker create --name #{container_name2} \ + -e CASSANDRA_SEEDS="$(docker inspect --format='{{ .NetworkSettings.IPAddress }}' #{container_name})" \ + cassandra:#{cassandra_nodetool_version}) + sh %(docker start #{container_name2}) + end + + task before_script: ['ci:common:before_script'] do + # Wait.for container_port + wait_on_docker_logs(container_name, 20, 'Listening for thrift clients', "Created default superuser role 'cassandra'") + wait_on_docker_logs(container_name2, 40, 'Listening for thrift clients', 'Not starting RPC server as requested') + sh %(docker exec #{container_name} cqlsh -e "CREATE KEYSPACE test WITH REPLICATION={'class':'SimpleStrategy', 'replication_factor':2}") + end + + task script: ['ci:common:script'] do + this_provides = [ + 'cassandra_nodetool' + ] + Rake::Task['ci:common:run_tests'].invoke(this_provides) + end + + task before_cache: ['ci:common:before_cache'] + + task cleanup: ['ci:common:cleanup'] do + sh %(docker kill #{container_name} 2>/dev/null || true) + sh %(docker rm #{container_name} 2>/dev/null || true) + sh %(docker kill #{container_name2} 2>/dev/null || true) + sh %(docker rm #{container_name2} 2>/dev/null || true) + sh %(rm -f #{__dir__}/jmxremote.password.tmp) + end + + task :execute do + exception = nil + begin + %w(before_install install before_script).each do |u| + Rake::Task["#{flavor.scope.path}:#{u}"].invoke + end + if !ENV['SKIP_TEST'] + Rake::Task["#{flavor.scope.path}:script"].invoke + else + puts 'Skipping tests'.yellow + end + Rake::Task["#{flavor.scope.path}:before_cache"].invoke + rescue => e + exception = e + puts "Failed task: #{e.class} #{e.message}".red + end + if ENV['SKIP_CLEANUP'] + puts 'Skipping cleanup, disposable environments are great'.yellow + else + puts 'Cleaning up' + 
Rake::Task["#{flavor.scope.path}:cleanup"].invoke + end + raise exception if exception + end + end +end diff --git a/cassandra_nodetool/ci/fixtures/nodetool_output b/cassandra_nodetool/ci/fixtures/nodetool_output new file mode 100644 index 0000000000000..bc671693cd216 --- /dev/null +++ b/cassandra_nodetool/ci/fixtures/nodetool_output @@ -0,0 +1,15 @@ +Datacenter: dc1 +=============== +Status=Up/Down +|/ State=Normal/Leaving/Joining/Moving +-- Address Load Tokens Owns (effective) Host ID Rack +DN 172.21.0.6 178.43 KB 256 35.4% f86d2d7a-e5c7-4c46-b36e-df08c565171a rack1 +UN 172.21.0.3 184.8 KB 256 31.0% 7501ef03-eb63-4db0-95e6-20bfeb7cdd87 RAC1 +UN 172.21.0.2 182.05 KB 256 33.5% fa859fcc-5e76-44ce-9609-1f314bdf21c1 RAC1 +Datacenter: dc2 +=============== +Status=Up/Down +|/ State=Normal/Leaving/Joining/Moving +-- Address Load Tokens Owns (effective) Host ID Rack +UN 172.21.0.5 216.75 KB 256 100.0% 2250363b-7453-48f2-b6cb-ef79cad0612b RAC1 +UN 172.21.0.4 223.34 KB 256 100.0% e521a2a4-39d3-4311-a195-667bf56450f4 RAC1 \ No newline at end of file diff --git a/cassandra_nodetool/ci/jmxremote.password b/cassandra_nodetool/ci/jmxremote.password new file mode 100644 index 0000000000000..239d0f318fa6b --- /dev/null +++ b/cassandra_nodetool/ci/jmxremote.password @@ -0,0 +1 @@ +controlRole QED \ No newline at end of file diff --git a/cassandra_nodetool/conf.yaml.example b/cassandra_nodetool/conf.yaml.example new file mode 100644 index 0000000000000..f5dcd81e115d5 --- /dev/null +++ b/cassandra_nodetool/conf.yaml.example @@ -0,0 +1,23 @@ +init_config: + # command or path to nodetool (e.g. /usr/bin/nodetool or docker exec container nodetool) + # can be overwritten on an instance + # nodetool: /usr/bin/nodetool + +instances: + + # the list of keyspaces to monitor + - keyspaces: [] + + # host that nodetool will connect to. + # host: localhost + + # the port JMX is listening to for connections. + # port: 7199 + + # a set of credentials to connect to the host. 
These are the credentials for the JMX server. + # For the check to work, this user must have a read/write access so that nodetool can execute the `status` command + # username: + # password: + + # a list of additional tags to be sent with the metrics + # tags: [] \ No newline at end of file diff --git a/cassandra_nodetool/manifest.json b/cassandra_nodetool/manifest.json new file mode 100644 index 0000000000000..c49cc0b3d0bc2 --- /dev/null +++ b/cassandra_nodetool/manifest.json @@ -0,0 +1,12 @@ +{ + "maintainer": "help@datadoghq.com", + "manifest_version": "0.1.0", + "max_agent_version": "6.0.0", + "min_agent_version": "5.6.3", + "name": "cassandra_nodetool", + "short_description": "monitor cassandra using the nodetool utility", + "guid": "00e4a8bd-8ec2-4bb4-b725-6aaa91618d13", + "support": "contrib", + "supported_os": ["linux","mac_os","windows"], + "version": "0.1.0" +} diff --git a/cassandra_nodetool/metadata.csv b/cassandra_nodetool/metadata.csv new file mode 100644 index 0000000000000..2632f34248bcd --- /dev/null +++ b/cassandra_nodetool/metadata.csv @@ -0,0 +1,6 @@ +metric_name,metric_type,interval,unit_name,per_unit_name,description,orientation,integration,short_name +cassandra.nodetool.status.replication_availability,gauge,,percent,,Percentage of data available per keyspace times replication factor,1,cassandra_nodetool,available data +cassandra.nodetool.status.replication_factor,gauge,,,,Replication factor per keyspace,0,cassandra_nodetool,replication factor +cassandra.nodetool.status.status,gauge,,,,Node status: up (1) or down (0),1,cassandra_nodetool,node status +cassandra.nodetool.status.owns,gauge,,percent,,Percentage of the data owned by the node per datacenter times the replication factor,0,cassandra_nodetool,owns +cassandra.nodetool.status.load,gauge,,byte,,Amount of file system data under the cassandra data directory without snapshot content,0,cassandra_nodetool,load \ No newline at end of file diff --git a/cassandra_nodetool/requirements.txt
b/cassandra_nodetool/requirements.txt new file mode 100644 index 0000000000000..f89ecf55da555 --- /dev/null +++ b/cassandra_nodetool/requirements.txt @@ -0,0 +1 @@ +# integration pip requirements diff --git a/cassandra_nodetool/test_cassandra_nodetool.py b/cassandra_nodetool/test_cassandra_nodetool.py new file mode 100644 index 0000000000000..61d11102d96d2 --- /dev/null +++ b/cassandra_nodetool/test_cassandra_nodetool.py @@ -0,0 +1,68 @@ +# (C) Datadog, Inc. 2010-2016 +# All rights reserved +# Licensed under Simplified BSD License (see LICENSE) + +# stdlib +from nose.plugins.attrib import attr +from mock import patch +from os.path import join, dirname + +# project +from tests.checks.common import AgentCheckTest, Fixtures + +FIXTURE_DIR = join(dirname(__file__), 'ci') +CASSANDRA_CONTAINER_NAME = 'dd-test-cassandra' + +def mock_output(*args): + return Fixtures.read_file('nodetool_output', sdk_dir=FIXTURE_DIR), "", 0 + +class TestCassandraNodetoolCheck(AgentCheckTest): + """Basic Test for cassandra_check integration.""" + CHECK_NAME = 'cassandra_nodetool' + + config = { + 'instances': [ + { + 'nodetool': 'docker exec %s nodetool' % CASSANDRA_CONTAINER_NAME, + 'keyspaces': ['system', 'test'], + 'username': 'controlRole', + 'password': 'QED', + 'tags': ['foo', 'bar'] + } + ] + } + + @patch('utils.subprocess_output.get_subprocess_output', side_effect=mock_output) + def test_check(self, mock_output): + + self.run_check(self.config) + + # test per datacenter metrics + self.assertEquals(mock_output.call_args[0][0], + ['docker', 'exec', CASSANDRA_CONTAINER_NAME, 'nodetool', '-h', 'localhost', '-p', + '7199', '-u', 'controlRole', '-pw', 'QED', 'status', '--', 'test']) + self.assertMetric('cassandra.nodetool.status.replication_availability', value=64.5, + tags=['keyspace:test', 'datacenter:dc1', 'foo', 'bar']) + self.assertMetric('cassandra.nodetool.status.replication_availability', value=200, + tags=['keyspace:test', 'datacenter:dc2', 'foo', 'bar']) + 
self.assertMetric('cassandra.nodetool.status.replication_factor', value=1, + tags=['keyspace:test', 'datacenter:dc1', 'foo', 'bar']) + self.assertMetric('cassandra.nodetool.status.replication_factor', value=2, + tags=['keyspace:test', 'datacenter:dc2', 'foo', 'bar']) + # test per node metrics + tags = ['datacenter:dc2', 'node_id:e521a2a4-39d3-4311-a195-667bf56450f4', + 'node_address:172.21.0.4', 'rack:RAC1', 'foo', 'bar'] + self.assertMetric('cassandra.nodetool.status.status', value=1, tags=tags) + self.assertMetric('cassandra.nodetool.status.owns', value=100, tags=tags + ['keyspace:test']) + self.assertMetric('cassandra.nodetool.status.load', value=223340, tags=tags) + self.assertServiceCheckOK('cassandra.nodetool.node_up', count=4) + self.assertServiceCheckCritical('cassandra.nodetool.node_up', count=1) + + @attr(requires='cassandra_nodetool') + def test_integration(self): + self.run_check(self.config) + + self.assertMetric('cassandra.nodetool.status.replication_availability', value=200, + tags=['keyspace:test', 'datacenter:datacenter1', 'foo', 'bar']) + self.assertMetric('cassandra.nodetool.status.replication_factor', value=2, + tags=['keyspace:test', 'datacenter:datacenter1', 'foo', 'bar']) diff --git a/circle.yml b/circle.yml index 0da58bf889fa4..7417959b6c188 100644 --- a/circle.yml +++ b/circle.yml @@ -80,6 +80,7 @@ test: - rake ci:run[kafka] - rake ci:run[docker_daemon] - rake ci:run[kubernetes] + - rake ci:run[cassandra_nodetool] - bundle exec rake requirements post: - if [[ $(docker ps -a -q) ]]; then docker stop $(docker ps -a -q); fi