Skip to content

Commit

Permalink
[monitor-opentelemetry] Live Metrics: Perf Counter update (#31341)
Browse files Browse the repository at this point in the history
### Packages impacted by this PR
[monitor-opentelemetry]

### Describe the problem that is addressed by this PR
See:
https://github.com/aep-health-and-standards/Telemetry-Collection-Spec/blob/main/ApplicationInsights/livemetrics.md#process-metrics

### What are the possible designs available to address the problem? If
there are more than one possible design, why was the one in this PR
chosen?
For CPU, looked at
[.net](https://github.com/Azure/azure-sdk-for-net/blob/7c8f16deec3a1150bfa04ec372d3abe5b238dd18/sdk/monitor/Azure.Monitor.OpenTelemetry.AspNetCore/src/LiveMetrics/Manager.Metrics.cs#L290)
&
[java](https://github.com/Azure/azure-sdk-for-java/blob/43bdc30cb296915ec22aa46ba1e7d885e124f6b4/sdk/monitor/azure-monitor-opentelemetry-exporter/src/main/java/com/azure/monitor/opentelemetry/exporter/implementation/quickpulse/QuickPulseDataCollector.java#L358)
implementations, along with this [stack
overflow](https://stackoverflow.com/questions/74776323/trying-to-get-normalized-cpu-usage-for-node-process#:~:text=I'm%20trying%20to%20calculate%20the%20normalized%20cpu%20percentage%20for%20my).
They all seem to be following the same rough calculation & multiplying
by 100 in the end for a percentage.


### Are there test cases added in this PR? _(If not, why?)_
Modified an existing test to assert that the body of the post request
includes old/new counter names with the process counter values.

### Checklists
- [x] Added impacted package name to the issue description
- [ ] Does this PR needs any fixes in the SDK Generator?** _(If so,
create an Issue in the
[Autorest/typescript](https://github.com/Azure/autorest.typescript)
repository and link it here)_
- [x] Added a changelog (if necessary)
  • Loading branch information
harsimar authored Oct 16, 2024
1 parent 2d33359 commit 3a21d19
Show file tree
Hide file tree
Showing 6 changed files with 119 additions and 90 deletions.
1 change: 1 addition & 0 deletions sdk/monitor/monitor-opentelemetry/CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
## 1.7.2 (Unreleased)

### Features Added
- Changed live metrics CPU/Memory perf counter metrics to emit normalized process CPU and process physical memory bytes.
- Support for Live Metrics Filtering

### Breaking Changes
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -77,6 +77,7 @@ import {
MetricFailureToCreateError,
} from "./filtering/quickpulseErrors";
import { SEMATTRS_EXCEPTION_TYPE } from "@opentelemetry/semantic-conventions";
import { getPhysicalMemory, getProcessorTimeNormalized } from "../utils";

const POST_INTERVAL = 1000;
const MAX_POST_WAIT_TIME = 20000;
Expand All @@ -99,8 +100,8 @@ export class LiveMetrics {
private requestFailedRateGauge: ObservableGauge | undefined;
private dependencyRateGauge: ObservableGauge | undefined;
private dependencyFailedRateGauge: ObservableGauge | undefined;
private memoryCommitedGauge: ObservableGauge | undefined;
private processorTimeGauge: ObservableGauge | undefined;
private processPhysicalBytesGauge: ObservableGauge | undefined;
private percentProcessorTimeNormalizedGauge: ObservableGauge | undefined;
private exceptionsRateGauge: ObservableGauge | undefined;

private documents: DocumentIngress[] = [];
Expand Down Expand Up @@ -135,13 +136,8 @@ export class LiveMetrics {
private lastDependencyRate: { count: number; time: number } = { count: 0, time: 0 };
private lastFailedDependencyRate: { count: number; time: number } = { count: 0, time: 0 };
private lastExceptionRate: { count: number; time: number } = { count: 0, time: 0 };
private lastCpus:
| {
model: string;
speed: number;
times: { user: number; nice: number; sys: number; idle: number; irq: number };
}[]
| undefined;
private lastCpuUsage: NodeJS.CpuUsage;
private lastHrTime: bigint;
private statsbeatOptionsUpdated = false;
private etag: string = "";
private errorTracker: CollectionConfigurationErrorTracker =
Expand Down Expand Up @@ -209,6 +205,8 @@ export class LiveMetrics {
// eslint-disable-next-line @typescript-eslint/no-misused-promises
this.handle = <any>setTimeout(this.goQuickpulse.bind(this), this.pingInterval);
this.handle.unref(); // Don't block apps from terminating
this.lastCpuUsage = process.cpuUsage();
this.lastHrTime = process.hrtime.bigint();
}

public shutdown(): void {
Expand Down Expand Up @@ -299,7 +297,6 @@ export class LiveMetrics {
getInstance().setStatsbeatFeatures({}, { liveMetrics: true });
this.statsbeatOptionsUpdated = true;
}
this.lastCpus = os.cpus();
this.totalDependencyCount = 0;
this.totalExceptionCount = 0;
this.totalFailedDependencyCount = 0;
Expand Down Expand Up @@ -363,15 +360,15 @@ export class LiveMetrics {
},
);

this.memoryCommitedGauge = this.meter.createObservableGauge(
QuickPulseOpenTelemetryMetricNames.COMMITTED_BYTES,
this.processPhysicalBytesGauge = this.meter.createObservableGauge(
QuickPulseOpenTelemetryMetricNames.PHYSICAL_BYTES,
{
valueType: ValueType.INT,
},
);

this.processorTimeGauge = this.meter.createObservableGauge(
QuickPulseOpenTelemetryMetricNames.PROCESSOR_TIME,
this.percentProcessorTimeNormalizedGauge = this.meter.createObservableGauge(
QuickPulseOpenTelemetryMetricNames.PROCESSOR_TIME_NORMALIZED,
{
valueType: ValueType.DOUBLE,
},
Expand All @@ -390,8 +387,10 @@ export class LiveMetrics {
this.dependencyRateGauge.addCallback(this.getDependencyRate.bind(this));
this.dependencyFailedRateGauge.addCallback(this.getDependencyFailedRate.bind(this));
this.exceptionsRateGauge.addCallback(this.getExceptionRate.bind(this));
this.memoryCommitedGauge.addCallback(this.getCommitedMemory.bind(this));
this.processorTimeGauge.addCallback(this.getProcessorTime.bind(this));
this.processPhysicalBytesGauge.addCallback(this.getPhysicalMemory.bind(this));
this.percentProcessorTimeNormalizedGauge.addCallback(
this.getProcessorTimeNormalized.bind(this),
);
}

/**
Expand Down Expand Up @@ -652,69 +651,16 @@ export class LiveMetrics {
};
}

private getCommitedMemory(observableResult: ObservableResult): void {
const freeMem = os.freemem();
const committedMemory = os.totalmem() - freeMem;
observableResult.observe(committedMemory);
}

private getTotalCombinedCpu(
cpus: os.CpuInfo[],
lastCpus: os.CpuInfo[],
): { combinedTotal: number; totalUser: number; totalIdle: number } {
let totalUser = 0;
let totalSys = 0;
let totalNice = 0;
let totalIdle = 0;
let totalIrq = 0;
for (let i = 0; !!cpus && i < cpus.length; i++) {
const cpu = cpus[i];
const lastCpu = lastCpus[i];
const times = cpu.times;
const lastTimes = lastCpu.times;
// user cpu time (or) % CPU time spent in user space
let user = times.user - lastTimes.user;
user = user > 0 ? user : 0; // Avoid negative values
totalUser += user;
// system cpu time (or) % CPU time spent in kernel space
let sys = times.sys - lastTimes.sys;
sys = sys > 0 ? sys : 0; // Avoid negative values
totalSys += sys;
// user nice cpu time (or) % CPU time spent on low priority processes
let nice = times.nice - lastTimes.nice;
nice = nice > 0 ? nice : 0; // Avoid negative values
totalNice += nice;
// idle cpu time (or) % CPU time spent idle
let idle = times.idle - lastTimes.idle;
idle = idle > 0 ? idle : 0; // Avoid negative values
totalIdle += idle;
// irq (or) % CPU time spent servicing/handling hardware interrupts
let irq = times.irq - lastTimes.irq;
irq = irq > 0 ? irq : 0; // Avoid negative values
totalIrq += irq;
}
const combinedTotal = totalUser + totalSys + totalNice + totalIdle + totalIrq;
return {
combinedTotal: combinedTotal,
totalUser: totalUser,
totalIdle: totalIdle,
};
private getPhysicalMemory(observableResult: ObservableResult): void {
const rss = getPhysicalMemory();
observableResult.observe(rss);
}

private getProcessorTime(observableResult: ObservableResult): void {
// this reports total ms spent in each category since the OS was booted, to calculate percent it is necessary
// to find the delta since the last measurement
const cpus = os.cpus();
if (cpus && cpus.length && this.lastCpus && cpus.length === this.lastCpus.length) {
const cpuTotals = this.getTotalCombinedCpu(cpus, this.lastCpus);

const value =
cpuTotals.combinedTotal > 0
? ((cpuTotals.combinedTotal - cpuTotals.totalIdle) / cpuTotals.combinedTotal) * 100
: 0;
observableResult.observe(value);
}
this.lastCpus = cpus;
private getProcessorTimeNormalized(observableResult: ObservableResult): void {
const cpuUsagePercent = getProcessorTimeNormalized(this.lastHrTime, this.lastCpuUsage);
observableResult.observe(cpuUsagePercent);
this.lastHrTime = process.hrtime.bigint();
this.lastCpuUsage = process.cpuUsage();
}

private updateConfiguration(response: PublishResponse | IsSubscribedResponse): void {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -31,8 +31,8 @@ export interface QuickpulseExporterOptions {
}

export enum QuickPulseOpenTelemetryMetricNames {
COMMITTED_BYTES = "azureMonitor.memoryCommittedBytes",
PROCESSOR_TIME = "azureMonitor.processorTotalProcessorTime",
PHYSICAL_BYTES = "azureMonitor.physicalBytes",
PROCESSOR_TIME_NORMALIZED = "azureMonitor.percentProcessorTimeNormalized",
REQUEST_RATE = "azureMonitor.requestsSec",
REQUEST_FAILURE_RATE = "azureMonitor.requestsFailedSec",
REQUEST_DURATION = "azureMonitor.requestDuration",
Expand All @@ -43,10 +43,14 @@ export enum QuickPulseOpenTelemetryMetricNames {
}

export enum QuickPulseMetricNames {
// Memory
// Memory (old)
COMMITTED_BYTES = "\\Memory\\Committed Bytes",
// CPU
// Memory (new - current process)
PHYSICAL_BYTES = "\\Process\\Physical Bytes",
// CPU (old)
PROCESSOR_TIME = "\\Processor(_Total)\\% Processor Time",
// CPU (new - current process)
PROCESSOR_TIME_NORMALIZED = "\\% Process\\Processor Time Normalized",
// Request
REQUEST_RATE = "\\ApplicationInsights\\Requests/Sec",
REQUEST_FAILURE_RATE = "\\ApplicationInsights\\Requests Failed/Sec",
Expand Down
25 changes: 21 additions & 4 deletions sdk/monitor/monitor-opentelemetry/src/metrics/quickpulse/utils.ts
Original file line number Diff line number Diff line change
Expand Up @@ -185,8 +185,8 @@ export function resourceMetricsToQuickpulseDataPoint(

// Update name to expected value in Quickpulse, needed because those names are invalid in OTel
switch (metric.descriptor.name) {
case QuickPulseOpenTelemetryMetricNames.COMMITTED_BYTES:
metricPoint.name = QuickPulseMetricNames.COMMITTED_BYTES;
case QuickPulseOpenTelemetryMetricNames.PHYSICAL_BYTES:
metricPoint.name = QuickPulseMetricNames.PHYSICAL_BYTES;
break;
case QuickPulseOpenTelemetryMetricNames.DEPENDENCY_DURATION:
metricPoint.name = QuickPulseMetricNames.DEPENDENCY_DURATION;
Expand All @@ -200,8 +200,8 @@ export function resourceMetricsToQuickpulseDataPoint(
case QuickPulseOpenTelemetryMetricNames.EXCEPTION_RATE:
metricPoint.name = QuickPulseMetricNames.EXCEPTION_RATE;
break;
case QuickPulseOpenTelemetryMetricNames.PROCESSOR_TIME:
metricPoint.name = QuickPulseMetricNames.PROCESSOR_TIME;
case QuickPulseOpenTelemetryMetricNames.PROCESSOR_TIME_NORMALIZED:
metricPoint.name = QuickPulseMetricNames.PROCESSOR_TIME_NORMALIZED;
break;
case QuickPulseOpenTelemetryMetricNames.REQUEST_DURATION:
metricPoint.name = QuickPulseMetricNames.REQUEST_DURATION;
Expand All @@ -225,6 +225,23 @@ export function resourceMetricsToQuickpulseDataPoint(
metricPoint.value = (dataPoint.value as Histogram).sum || 0;
}
metricPoints.push(metricPoint);

// TODO: remove the metric points with the old metric names after
// UI side has done their changes to support the new names.
if (
metricPoint.name === QuickPulseMetricNames.PHYSICAL_BYTES ||
metricPoint.name === QuickPulseMetricNames.PROCESSOR_TIME_NORMALIZED
) {
const oldMetricPoint: MetricPoint = {
weight: 1,
name:
metricPoint.name === QuickPulseMetricNames.PHYSICAL_BYTES
? QuickPulseMetricNames.COMMITTED_BYTES
: QuickPulseMetricNames.PROCESSOR_TIME,
value: dataPoint.value as number,
};
metricPoints.push(oldMetricPoint);
}
});
});
});
Expand Down
25 changes: 25 additions & 0 deletions sdk/monitor/monitor-opentelemetry/src/metrics/utils.ts
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@ import {
} from "./types";
import { LogRecord } from "@opentelemetry/sdk-logs";
import { Resource } from "@opentelemetry/resources";
import * as os from "os";

export function getRequestDimensions(span: ReadableSpan): Attributes {
const dimensions: MetricRequestDimensions = getBaseDimensions(span.resource);
Expand Down Expand Up @@ -171,3 +172,27 @@ export function convertDimensions(
}
return convertedDimensions as Attributes;
}

// to get physical memory bytes
export function getPhysicalMemory(): number {
return process.memoryUsage.rss();
}

// This function can get the normalized cpu, but it assumes that after this function is called,
// that the process.hrtime.bigint() & process.cpuUsage() are called/stored to be used as the
// parameters for the next call.
export function getProcessorTimeNormalized(
lastHrTime: bigint,
lastCpuUsage: NodeJS.CpuUsage,
): number {
let numCpus = os.cpus().length;
const usageDif = process.cpuUsage(lastCpuUsage);
const elapsedTimeNs = process.hrtime.bigint() - lastHrTime;

const usageDifMs = (usageDif.user + usageDif.system) / 1000.0;
const elapsedTimeMs = elapsedTimeNs === BigInt(0) ? 1 : Number(elapsedTimeNs) / 1000000.0;
// just for division safety, don't know a case in which this would actually happen
numCpus = numCpus === 0 ? 1 : numCpus;

return (usageDifMs / elapsedTimeMs / numCpus) * 100;
}
Original file line number Diff line number Diff line change
Expand Up @@ -7,10 +7,14 @@ import { ExportResultCode, millisToHrTime } from "@opentelemetry/core";
import { LoggerProvider, LogRecord } from "@opentelemetry/sdk-logs";
import { LiveMetrics } from "../../../../src/metrics/quickpulse/liveMetrics";
import { InternalConfig } from "../../../../src/shared";
import { QuickPulseOpenTelemetryMetricNames } from "../../../../src/metrics/quickpulse/types";
import {
QuickPulseMetricNames,
QuickPulseOpenTelemetryMetricNames,
} from "../../../../src/metrics/quickpulse/types";
/* eslint-disable-next-line @typescript-eslint/no-redeclare */
import { Exception, RemoteDependency, Request } from "../../../../src/generated";
import { AccessToken, TokenCredential } from "@azure/core-auth";
import { resourceMetricsToQuickpulseDataPoint } from "../../../../src/metrics/quickpulse/utils";

describe("#LiveMetrics", () => {
let exportStub: sinon.SinonStub;
Expand Down Expand Up @@ -169,16 +173,16 @@ describe("#LiveMetrics", () => {
assert.ok(metrics[5].dataPoints[0].value > 0, "DEPENDENCY_FAILURE_RATE value");
assert.strictEqual(
metrics[6].descriptor.name,
QuickPulseOpenTelemetryMetricNames.COMMITTED_BYTES,
QuickPulseOpenTelemetryMetricNames.PHYSICAL_BYTES,
);
assert.strictEqual(metrics[6].dataPoints.length, 1, "dataPoints count");
assert.ok(metrics[6].dataPoints[0].value > 0, "COMMITTED_BYTES dataPoint value");
assert.ok(metrics[6].dataPoints[0].value > 0, "PHYSICAL_BYTES dataPoint value");
assert.strictEqual(
metrics[7].descriptor.name,
QuickPulseOpenTelemetryMetricNames.PROCESSOR_TIME,
QuickPulseOpenTelemetryMetricNames.PROCESSOR_TIME_NORMALIZED,
);
assert.strictEqual(metrics[7].dataPoints.length, 1, "dataPoints count");
assert.ok(metrics[7].dataPoints[0].value >= 0, "PROCESSOR_TIME dataPoint value");
assert.ok(metrics[7].dataPoints[0].value >= 0, "PROCESSOR_TIME_NORMALIZED dataPoint value");
assert.strictEqual(
metrics[8].descriptor.name,
QuickPulseOpenTelemetryMetricNames.EXCEPTION_RATE,
Expand Down Expand Up @@ -226,6 +230,38 @@ describe("#LiveMetrics", () => {
assert.equal((documents[i].properties as any)[0].key, "customAttribute");
assert.equal((documents[i].properties as any)[0].value, "test");
}

// testing that the old/new names for the perf counters appear in the monitoring data point,
// with the values of the process counters
const monitoringDataPoints = resourceMetricsToQuickpulseDataPoint(
resourceMetrics,
autoCollect["quickpulseExporter"]["baseMonitoringDataPoint"],
documents,
[],
new Map<string, number>(),
);
assert.ok(monitoringDataPoints[0].metrics?.length === 11);
assert.ok(
monitoringDataPoints[0].metrics[6].name === QuickPulseMetricNames.PHYSICAL_BYTES.toString(),
);
assert.ok(monitoringDataPoints[0].metrics[6].value > 0);
assert.ok(
monitoringDataPoints[0].metrics[7].name === QuickPulseMetricNames.COMMITTED_BYTES.toString(),
);
assert.ok(
monitoringDataPoints[0].metrics[7].value === monitoringDataPoints[0].metrics[6].value,
);
assert.ok(
monitoringDataPoints[0].metrics[8].name ===
QuickPulseMetricNames.PROCESSOR_TIME_NORMALIZED.toString(),
);
assert.ok(monitoringDataPoints[0].metrics[8].value >= 0);
assert.ok(
monitoringDataPoints[0].metrics[9].name === QuickPulseMetricNames.PROCESSOR_TIME.toString(),
);
assert.ok(
monitoringDataPoints[0].metrics[9].value === monitoringDataPoints[0].metrics[8].value,
);
});

it("should retrieve meter provider", () => {
Expand Down

0 comments on commit 3a21d19

Please sign in to comment.