-
Notifications
You must be signed in to change notification settings - Fork 2.5k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Prometheus Receiver metric type fixes to match Prometheus functionality #4865
Prometheus Receiver metric type fixes to match Prometheus functionality #4865
Conversation
This PR was marked stale due to lack of activity. It will be closed in 7 days. |
Hello @gracewehner, thank you for this change and great to interact with you here! I've compiled a comprehensive end-to-end test for you, but oddly it sets all the types already to Gauges for OTLP; could you please perhaps help me understand what the error is here? In here we are skipping the TYPE; please feel free to modify this to the expected result until it fails without your PR if perhaps I missed something. package internal_test
import (
"context"
"fmt"
"io/ioutil"
"net/http"
"net/http/httptest"
"net/url"
"strings"
"testing"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
"go.opentelemetry.io/collector/component"
"go.opentelemetry.io/collector/exporter/otlphttpexporter"
"go.opentelemetry.io/collector/model/otlp"
"go.opentelemetry.io/collector/model/pdata"
"go.opentelemetry.io/collector/processor/batchprocessor"
"go.opentelemetry.io/collector/service"
"go.opentelemetry.io/collector/service/parserprovider"
"go.uber.org/zap"
"go.uber.org/zap/zapcore"
"github.com/open-telemetry/opentelemetry-collector-contrib/receiver/prometheusreceiver"
)
// TestUndefinedMetricsToGauges is an end-to-end test: it runs a real
// collector pipeline (prometheus receiver -> batch processor -> otlphttp
// exporter) against a fake Prometheus scrape endpoint whose metrics carry
// no "# TYPE" hint, then asserts that every scraped, non-internal metric
// arrives at the OTLP sink typed as a Gauge.
func TestUndefinedMetricsToGauges(t *testing.T) {
	// Skipped under -short: starting the whole collector service is slow.
	if testing.Short() {
		t.Skip("This test can take a long time")
	}
	ctx, cancel := context.WithCancel(context.Background())
	// 1. Fake scrape target. Serves a fixed Prometheus exposition payload
	// (HELP lines only — deliberately no "# TYPE" lines) until ctx is
	// cancelled, after which it returns an empty body.
	scrapeServer := httptest.NewServer(http.HandlerFunc(func(rw http.ResponseWriter, req *http.Request) {
		select {
		case <-ctx.Done():
			return
		default:
			rw.Write([]byte(`
# HELP python_gc_objects_collected_total The total number of objects garbage collected
python_gc_objects_collected_total{category="old",generation="2"} 100
# HELP python_gc_objects_uncollectable_total The total number of objects that couldn't be garbage collected
python_gc_objects_uncollectable_total{category="old",generation="8"} 100
# HELP python_gc_collections_total The total of garbage collections performed.
python_gc_collections_total{prog="main.py"} 456
process_cpu_seconds_total{cpuid="8"} 19.1`))
		}
	}))
	defer scrapeServer.Close()
	// 2. Fake OTLP/HTTP backend. Each exported protobuf payload is
	// unmarshaled and the resulting pdata.Metrics forwarded on a buffered
	// channel for the assertions below. Errors panic because this handler
	// runs outside the test goroutine.
	metricsUnmarshaler := otlp.NewProtobufMetricsUnmarshaler()
	otlpMetricsCh := make(chan *pdata.Metrics, 10)
	otlpHTTPServer := httptest.NewServer(http.HandlerFunc(func(rw http.ResponseWriter, req *http.Request) {
		otlpProtoBlob, err := ioutil.ReadAll(req.Body)
		if err != nil {
			panic(err)
		}
		metrics, err := metricsUnmarshaler.UnmarshalMetrics(otlpProtoBlob)
		if err != nil {
			panic(err)
		}
		otlpMetricsCh <- &metrics
	}))
	defer otlpHTTPServer.Close()
	serverURL, err := url.Parse(scrapeServer.URL)
	require.Nil(t, err)
	// 3. Set the OpenTelemetry Prometheus receiver.
	// The 2ms scrape_interval keeps the test fast by producing exports
	// quickly. NOTE(review): YAML indentation below was reconstructed — the
	// scrape that produced this page flattened it.
	config := fmt.Sprintf(`
receivers:
  prometheus:
    config:
      scrape_configs:
        - job_name: 'test'
          scrape_interval: 2ms
          static_configs:
            - targets: [%q]
processors:
  batch:
exporters:
  otlphttp:
    endpoint: %q
    insecure: true
service:
  pipelines:
    metrics:
      receivers: [prometheus]
      processors: [batch]
      exporters: [otlphttp]`, serverURL.Host, otlpHTTPServer.URL)
	// 4. Run the OpenTelemetry Collector.
	receivers, err := component.MakeReceiverFactoryMap(prometheusreceiver.NewFactory())
	require.Nil(t, err)
	exporters, err := component.MakeExporterFactoryMap(otlphttpexporter.NewFactory())
	require.Nil(t, err)
	processors, err := component.MakeProcessorFactoryMap(batchprocessor.NewFactory())
	require.Nil(t, err)
	factories := component.Factories{
		Receivers:  receivers,
		Exporters:  exporters,
		Processors: processors,
	}
	appSettings := service.CollectorSettings{
		Factories:      factories,
		ParserProvider: parserprovider.NewInMemory(strings.NewReader(config)),
		BuildInfo: component.BuildInfo{
			Command:     "otelcol",
			Description: "OpenTelemetry Collector",
			Version:     "tests",
		},
		LoggingOptions: []zap.Option{
			// Turn off the verbose logging from the collector.
			zap.WrapCore(func(zapcore.Core) zapcore.Core {
				return zapcore.NewNopCore()
			}),
		},
	}
	app, err := service.New(appSettings)
	require.Nil(t, err)
	go func() {
		// t.Error (not t.Fatal) because this runs in a separate goroutine.
		if err := app.Run(); err != nil {
			t.Error(err)
		}
	}()
	// Wait until the collector has actually started.
	stateChannel := app.GetStateChannel()
	for notYetStarted := true; notYetStarted; {
		switch state := <-stateChannel; state {
		case service.Running, service.Closed, service.Closing:
			notYetStarted = false
		}
	}
	defer func() {
		app.Shutdown()
		// Wait until we are closed.
		<-stateChannel
		for notClosed := true; notClosed; {
			switch state := <-stateChannel; state {
			case service.Closed:
				notClosed = false
			}
		}
	}()
	// 5. Let's wait on 3 fetches.
	var metrics []*pdata.Metrics
	for i := 0; i < 3; i++ {
		metrics = append(metrics, <-otlpMetricsCh)
	}
	defer cancel()
	// 6. Walk every exported batch: everything except the receiver's
	// synthetic "scrape_*" and "up" series must be a Gauge.
	for _, metricL := range metrics {
		rms := metricL.ResourceMetrics()
		assert.True(t, rms.Len() > 0, "Expecting at least a single ResourceMetrics")
		for i := 0; i < rms.Len(); i++ {
			rmi := rms.At(i)
			ilmL := rmi.InstrumentationLibraryMetrics()
			assert.True(t, ilmL.Len() > 0, "Expecting at least a single InstrumentationLibraryMetrics")
			for i := 0; i < ilmL.Len(); i++ {
				ilm := ilmL.At(i)
				metricL := ilm.Metrics()
				assert.True(t, metricL.Len() > 0, "Expecting at least a single metric")
				nonInternalMetricCount := 0
				for i := 0; i < metricL.Len(); i++ {
					metric := metricL.At(i)
					metricName := metric.Name()
					// Skip the per-scrape series the receiver itself adds.
					if strings.HasPrefix(metricName, "scrape_") || strings.HasPrefix(metricName, "up") {
						continue
					}
					// Everything else should be a gauge.
					// NOTE(review): failure message is missing a space before
					// "is not a gauge".
					assert.Equal(t, pdata.MetricDataTypeGauge, metric.DataType(), metricName+"is not a gauge")
					nonInternalMetricCount++
				}
				require.True(t, nonInternalMetricCount > 0, "expecting at least one non-internal metric")
			}
		}
	}
}
Thank you @odeke-em for this end-to-end test! The reason the above works is that these metrics have `_total` suffixes, so the code follows this logic, which converts unknown types to gauges: opentelemetry-collector-contrib/receiver/prometheusreceiver/internal/metricfamily.go Lines 57 to 67 in e1914a8
which sets metadata.Type = textparse.MetricTypeUnknown which is then converted to gauge in convToOCAMetricType() .
Using something like opentelemetry-collector-contrib/receiver/prometheusreceiver/internal/metricsbuilder.go Lines 247 to 267 in e1914a8
As for the issues with the Python SDK metrics, I could not replicate with the above end-to-end test the situation where the metadata doesn't have the `_total` suffix but the metric name does, so it must be something specific to the Python SDK: when exposing the metrics this way and scraping them, I see the `_total` suffix is dropped with the current collector. I do, however, have a unit test that covers the metadata not having the `_total` suffix while the metric name does. |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
LGTM with one nit.
Hi @Aneurysm9 could you please take a look at this PR? We would like to get this in with the upcoming release. Thank you! |
Hi @bogdandrutu could you please take a look for this to be merged? Thank you! |
Hi @alolita could this PR be marked ready for merge? We would like to get this in for the next release. Thank you! |
This PR was marked stale due to lack of activity. It will be closed in 7 days. |
Description:
Same as open-telemetry/opentelemetry-collector#3853 but moved here with Prometheus receiver.
I also addressed the feedback to also make changes to otlp_metricfamily.go for when we switch over to those files. The current files are not caught up with what's in metricfamily.go and open-telemetry/opentelemetry-collector#3741, so some merge conflicts may need to be fixed when open-telemetry/opentelemetry-collector#3741 is merged (like uncommenting the logger line).
Metrics without a `# TYPE` hint are given the type UNSPECIFIED, which converts to empty OTLP metrics such as `{"metrics":[{},{},{}]}` in the OTLP JSON output, and these metrics are considered invalid. Prometheus, however, treats such metrics as a Gauge type, and they show up correctly in the Prometheus expression browser. With these changes, if the metadata type is not found, the type is set to `textparse.MetricTypeUnknown`, so that in the function `convToOCAMetricType(metadata.Type)` the type is converted to `metricspb.MetricDescriptor_GAUGE_DOUBLE`.
"Add `_total` to the list of trimmable metric name suffixes" (opentelemetry-collector#3603) fixed the issue by checking the name without the `_total` suffix to look up the metadata. However, the family name without the `_total` suffix is used as the metric name, so the metric `python_gc_objects_collected_total` is converted to the name `python_gc_objects_collected`, whereas Prometheus still includes the `_total` suffix. This is because previously the metric family name only differed from metric names when the metric was a summary or histogram, which have `_sum` and `_count` metrics that are added back in later. There isn't this special logic for counters, so the family name needs to be set to the actual metric name when it is a counter and has the `_total` suffix.

Link to tracking issue: #4743 and open-telemetry/opentelemetry-collector#3557
Testing:
Tested on a Kubernetes cluster with Prometheus scrape targets that have metrics without the `# TYPE` hint for (1), and with metrics sent via the Python SDK, such as those below, for testing (2).
Changed tests for metrics without the type hint to expect the gauge type.
Added a test case for the situation where the metadata key excludes the `_total` suffix but the metric name should still have the `_total` suffix.