From 526c81b138220dbff9c847508d3db32e33145bdd Mon Sep 17 00:00:00 2001
From: Andrei Eres
Date: Tue, 19 Dec 2023 12:14:22 +0100
Subject: [PATCH] subsystem benchmarks: add cpu profiling (#2734)

Ready-to-merge version of https://github.com/paritytech/polkadot-sdk/pull/2601

- Added optional CPU profiling
- Updated instructions on how to set up Prometheus, Pyroscope and Grafana
- Added a flamegraph dashboard image

---------

Co-authored-by: ordian
---
 Cargo.lock                                    |  2 +
 polkadot/node/subsystem-bench/Cargo.toml      |  2 +
 polkadot/node/subsystem-bench/README.md       | 87 ++++++++++++-------
 .../subsystem-bench/docker/docker-compose.yml | 35 ++++++++
 .../docker/prometheus/prometheus.yml          | 11 +++
 .../grafana/cpu-profiling.json                | 70 +++++++++++++++
 .../subsystem-bench/src/subsystem-bench.rs    | 29 +++++++
 7 files changed, 206 insertions(+), 30 deletions(-)
 create mode 100644 polkadot/node/subsystem-bench/docker/docker-compose.yml
 create mode 100644 polkadot/node/subsystem-bench/docker/prometheus/prometheus.yml
 create mode 100644 polkadot/node/subsystem-bench/grafana/cpu-profiling.json

diff --git a/Cargo.lock b/Cargo.lock
index 32e68779fd1d..b65d6fc1b711 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -13322,6 +13322,8 @@ dependencies = [
 "polkadot-primitives",
 "polkadot-primitives-test-helpers",
 "prometheus",
+ "pyroscope",
+ "pyroscope_pprofrs",
 "rand 0.8.5",
 "sc-keystore",
 "sc-network",
diff --git a/polkadot/node/subsystem-bench/Cargo.toml b/polkadot/node/subsystem-bench/Cargo.toml
index cf9fd8822dde..6504c8f714de 100644
--- a/polkadot/node/subsystem-bench/Cargo.toml
+++ b/polkadot/node/subsystem-bench/Cargo.toml
@@ -56,6 +56,8 @@ serde = "1.0.192"
 serde_yaml = "0.9"
 paste = "1.0.14"
 orchestra = { version = "0.3.3", default-features = false, features = ["futures_channel"] }
+pyroscope = "0.5.7"
+pyroscope_pprofrs = "0.2.7"

 [features]
 default = []
diff --git a/polkadot/node/subsystem-bench/README.md b/polkadot/node/subsystem-bench/README.md
index 21844853334b..b1476db27548 100644
--- a/polkadot/node/subsystem-bench/README.md
+++ b/polkadot/node/subsystem-bench/README.md
@@ -1,6 +1,6 @@
 # Subsystem benchmark client

 Run parachain consensus stress and performance tests on your development machine.

 ## Motivation
@@ -26,17 +26,26 @@ The output binary will be placed in `target/testnet/subsystem-bench`.

 ### Test metrics

 Subsystem, CPU usage and network metrics are exposed via a prometheus endpoint during the test execution.
 A small subset of these collected metrics is displayed in the CLI, but for an in-depth analysis of the test results,
 a local Grafana/Prometheus stack is needed.

+### Run Prometheus, Pyroscope and Grafana in Docker
+
+If Docker is not an option, follow the next sections to install Prometheus, Pyroscope and Grafana
+manually on your machine.
+
+```bash
+cd polkadot/node/subsystem-bench/docker
+docker compose up
+```
+
 ### Install Prometheus

 Please follow the [official installation guide](https://prometheus.io/docs/prometheus/latest/installation/) for your
 platform/OS.

 After successfully installing and starting up Prometheus, we need to alter its configuration such that it
 will scrape the benchmark prometheus endpoint `127.0.0.1:9999`.
Please check the official Prometheus documentation
regarding the location of `prometheus.yml`. On macOS, for example, the full path is `/opt/homebrew/etc/prometheus.yml`.

prometheus.yml:
@@ -57,13 +66,29 @@ scrape_configs:

To complete this step, restart the Prometheus server so that it picks up the new configuration.

-### Install and setup Grafana
+### Install Pyroscope
+
+To collect CPU profiling data, you must be running the Pyroscope server.
+Follow the [installation guide](https://grafana.com/docs/pyroscope/latest/get-started/)
+relevant to your operating system.
+
+### Install Grafana

Follow the [installation guide](https://grafana.com/docs/grafana/latest/setup-grafana/installation/) relevant to
your operating system.

-Once you have the installation up and running, configure the local Prometheus as a data source by following
-[this guide](https://grafana.com/docs/grafana/latest/datasources/prometheus/configure-prometheus-data-source/)
+### Setup Grafana
+
+Once you have the installation up and running, configure the local Prometheus and Pyroscope (if needed)
+as data sources by following these guides:
+
+- [Prometheus](https://grafana.com/docs/grafana/latest/datasources/prometheus/configure-prometheus-data-source/)
+- [Pyroscope](https://grafana.com/docs/grafana/latest/datasources/grafana-pyroscope/)
+
+If you are running the servers in Docker, use the following URLs:
+
+- Prometheus `http://prometheus:9090/`
+- Pyroscope `http://pyroscope:4040/`

#### Import dashboards

@@ -86,26 +111,29 @@ Commands:
```

Note: `test-sequence` is a special test objective that wraps up an arbitrary number of test objectives. It is typically
- used to run a suite of tests defined in a `yaml` file like in this [example](examples/availability_read.yaml).
+used to run a suite of tests defined in a `yaml` file like in this [example](examples/availability_read.yaml).
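Both styles of invocation are shown below; the same two commands also appear in the worked examples later in this README:

```bash
# Run a single test objective and override one of the standard options
target/testnet/subsystem-bench --n-cores 10 data-availability-read

# Run a suite of test objectives described in a YAML file
target/testnet/subsystem-bench test-sequence --path polkadot/node/subsystem-bench/examples/availability_read.yaml
```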
### Standard test options
-
+
```
Options:
-      --network                   The type of network to be emulated [default: ideal] [possible values:
-                                  ideal, healthy, degraded]
-      --n-cores                   Number of cores to fetch availability for [default: 100]
-      --n-validators              Number of validators to fetch chunks from [default: 500]
-      --min-pov-size              The minimum pov size in KiB [default: 5120]
-      --max-pov-size              The maximum pov size bytes [default: 5120]
-  -n, --num-blocks                The number of blocks the test is going to run [default: 1]
-  -p, --peer-bandwidth            The bandwidth of simulated remote peers in KiB
-  -b, --bandwidth                 The bandwidth of our simulated node in KiB
-      --peer-error                Simulated conection error ratio [0-100]
-      --peer-min-latency          Minimum remote peer latency in milliseconds [0-5000]
-      --peer-max-latency          Maximum remote peer latency in milliseconds [0-5000]
-  -h, --help                      Print help
-  -V, --version                   Print version
+      --network                   The type of network to be emulated [default: ideal] [possible values:
+                                  ideal, healthy, degraded]
+      --n-cores                   Number of cores to fetch availability for [default: 100]
+      --n-validators              Number of validators to fetch chunks from [default: 500]
+      --min-pov-size              The minimum pov size in KiB [default: 5120]
+      --max-pov-size              The maximum pov size bytes [default: 5120]
+  -n, --num-blocks                The number of blocks the test is going to run [default: 1]
+  -p, --peer-bandwidth            The bandwidth of simulated remote peers in KiB
+  -b, --bandwidth                 The bandwidth of our simulated node in KiB
+      --peer-error                Simulated connection error ratio [0-100]
+      --peer-min-latency          Minimum remote peer latency in milliseconds [0-5000]
+      --peer-max-latency          Maximum remote peer latency in milliseconds [0-5000]
+      --profile                   Enable CPU Profiling with Pyroscope
+      --pyroscope-url             Pyroscope Server URL [default: http://localhost:4040]
+      --pyroscope-sample-rate     Pyroscope Sample Rate [default: 113]
+  -h, --help                      Print help
+  -V, --version                   Print version
```

These apply to all test objectives, except `test-sequence`, which relies on the values being specified in a file.
@@ -123,8 +151,8 @@ Benchmark availability recovery strategies
Usage: subsystem-bench data-availability-read [OPTIONS]

Options:
-  -f, --fetch-from-backers    Turbo boost AD Read by fetching the full availability datafrom backers first. Saves CPU
-                              as we don't need to re-construct from chunks. Tipically this is only faster if nodes
+  -f, --fetch-from-backers    Turbo boost AD Read by fetching the full availability data from backers first. Saves CPU
+                              as we don't need to re-construct from chunks. Typically this is only faster if nodes
                               have enough bandwidth
  -h, --help                   Print help
```
@@ -152,8 +180,8 @@ Let's run an availability read test which will recover availability for 10 cores
node validator network.

```
- target/testnet/subsystem-bench --n-cores 10 data-availability-read
-[2023-11-28T09:01:59Z INFO subsystem_bench::core::display] n_validators = 500, n_cores = 10, pov_size = 5120 - 5120,
+ target/testnet/subsystem-bench --n-cores 10 data-availability-read
+[2023-11-28T09:01:59Z INFO subsystem_bench::core::display] n_validators = 500, n_cores = 10, pov_size = 5120 - 5120,
 error = 0, latency = None
[2023-11-28T09:01:59Z INFO subsystem-bench::availability] Generating template candidate index=0 pov_size=5242880
[2023-11-28T09:01:59Z INFO subsystem-bench::availability] Created test environment.
[2023-11-28T09:02:07Z INFO subsystem_bench::availability] All blocks processed in 6001ms [2023-11-28T09:02:07Z INFO subsystem_bench::availability] Throughput: 51200 KiB/block [2023-11-28T09:02:07Z INFO subsystem_bench::availability] Block time: 6001 ms -[2023-11-28T09:02:07Z INFO subsystem_bench::availability] - +[2023-11-28T09:02:07Z INFO subsystem_bench::availability] + Total received from network: 66 MiB Total sent to network: 58 KiB Total subsystem CPU usage 4.16s @@ -192,8 +220,7 @@ view the test progress in real time by accessing [this link](http://localhost:30 Now run `target/testnet/subsystem-bench test-sequence --path polkadot/node/subsystem-bench/examples/availability_read.yaml` -and view the metrics in real time and spot differences between different `n_valiator` values. - +and view the metrics in real time and spot differences between different `n_validators` values. ## Create new test objectives This tool is intended to make it easy to write new test objectives that focus individual subsystems, diff --git a/polkadot/node/subsystem-bench/docker/docker-compose.yml b/polkadot/node/subsystem-bench/docker/docker-compose.yml new file mode 100644 index 000000000000..fc5eb1f634e6 --- /dev/null +++ b/polkadot/node/subsystem-bench/docker/docker-compose.yml @@ -0,0 +1,35 @@ +services: + grafana: + image: grafana/grafana-enterprise:latest + container_name: grafana + restart: always + networks: + - subsystem-bench + ports: + - "3000:3000" + + prometheus: + image: prom/prometheus:latest + container_name: prometheus + restart: always + networks: + - subsystem-bench + volumes: + - ./prometheus:/etc/prometheus + extra_hosts: + - "host.docker.internal:host-gateway" + ports: + - "9090:9090" + - "9999:9999" + + pyroscope: + container_name: pyroscope + image: grafana/pyroscope:latest + restart: always + networks: + - subsystem-bench + ports: + - "4040:4040" + +networks: + subsystem-bench: diff --git a/polkadot/node/subsystem-bench/docker/prometheus/prometheus.yml b/polkadot/node/subsystem-bench/docker/prometheus/prometheus.yml new file mode 100644 index 000000000000..0bb25cfcb36c --- /dev/null +++ b/polkadot/node/subsystem-bench/docker/prometheus/prometheus.yml @@ -0,0 +1,11 @@ +global: + scrape_interval: 5s + +scrape_configs: + - job_name: "prometheus" + static_configs: + - targets: ["localhost:9090"] + - job_name: "subsystem-bench" + scrape_interval: 0s500ms + static_configs: + - targets: ['host.docker.internal:9999'] diff --git a/polkadot/node/subsystem-bench/grafana/cpu-profiling.json b/polkadot/node/subsystem-bench/grafana/cpu-profiling.json new file mode 100644 index 000000000000..0d53a1b93657 --- /dev/null +++ b/polkadot/node/subsystem-bench/grafana/cpu-profiling.json @@ -0,0 +1,70 @@ +{ + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": { + "type": "grafana", + "uid": "-- Grafana --" + }, + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "type": "dashboard" + } + ] + }, + "editable": true, + "fiscalYearStartMonth": 0, + "graphTooltip": 0, + "id": 1, + "links": [], + "liveNow": false, + "panels": [ + { + "datasource": { + "type": "grafana-pyroscope-datasource", + "uid": "bc3bc04f-85f9-464b-8ae3-fbe0949063f6" + }, + "gridPos": { + "h": 18, + "w": 24, + "x": 0, + "y": 0 + }, + "id": 1, + "targets": [ + { + "datasource": { + "type": "grafana-pyroscope-datasource", + "uid": "bc3bc04f-85f9-464b-8ae3-fbe0949063f6" + }, + "groupBy": [], + "labelSelector": "{service_name=\"subsystem-bench\"}", + "profileTypeId": 
"process_cpu:cpu:nanoseconds:cpu:nanoseconds", + "queryType": "profile", + "refId": "A" + } + ], + "title": "CPU Profiling", + "type": "flamegraph" + } + ], + "refresh": "", + "schemaVersion": 38, + "tags": [], + "templating": { + "list": [] + }, + "time": { + "from": "now-6h", + "to": "now" + }, + "timepicker": {}, + "timezone": "", + "title": "CPU Profiling", + "uid": "c31191d5-fe2b-49e2-8b1c-1451f31d1628", + "version": 1, + "weekStart": "" + } diff --git a/polkadot/node/subsystem-bench/src/subsystem-bench.rs b/polkadot/node/subsystem-bench/src/subsystem-bench.rs index da7e5441f748..29b62b27855a 100644 --- a/polkadot/node/subsystem-bench/src/subsystem-bench.rs +++ b/polkadot/node/subsystem-bench/src/subsystem-bench.rs @@ -18,6 +18,8 @@ //! CI regression testing. use clap::Parser; use color_eyre::eyre; +use pyroscope::PyroscopeAgent; +use pyroscope_pprofrs::{pprof_backend, PprofConfig}; use colored::Colorize; use std::{path::Path, time::Duration}; @@ -76,12 +78,34 @@ struct BenchCli { /// Maximum remote peer latency in milliseconds [0-5000]. pub peer_max_latency: Option, + #[clap(long, default_value_t = false)] + /// Enable CPU Profiling with Pyroscope + pub profile: bool, + + #[clap(long, requires = "profile", default_value_t = String::from("http://localhost:4040"))] + /// Pyroscope Server URL + pub pyroscope_url: String, + + #[clap(long, requires = "profile", default_value_t = 113)] + /// Pyroscope Sample Rate + pub pyroscope_sample_rate: u32, + #[command(subcommand)] pub objective: cli::TestObjective, } impl BenchCli { fn launch(self) -> eyre::Result<()> { + let agent_running = if self.profile { + let agent = PyroscopeAgent::builder(self.pyroscope_url.as_str(), "subsystem-bench") + .backend(pprof_backend(PprofConfig::new().sample_rate(self.pyroscope_sample_rate))) + .build()?; + + Some(agent.start()?) + } else { + None + }; + let configuration = self.standard_configuration; let mut test_config = match self.objective { TestObjective::TestSequence(options) => { @@ -165,6 +189,11 @@ impl BenchCli { env.runtime() .block_on(availability::benchmark_availability_read(&mut env, state)); + if let Some(agent_running) = agent_running { + let agent_ready = agent_running.stop()?; + agent_ready.shutdown(); + } + Ok(()) } }