diff --git a/cmd/root.go b/cmd/root.go index d6592b479..2926463b9 100644 --- a/cmd/root.go +++ b/cmd/root.go @@ -27,6 +27,7 @@ import ( "github.com/googleapis/genai-toolbox/internal/log" "github.com/googleapis/genai-toolbox/internal/server" + "github.com/googleapis/genai-toolbox/internal/telemetry" "github.com/spf13/cobra" "gopkg.in/yaml.v3" ) @@ -106,6 +107,9 @@ func NewCommand(opts ...Option) *Command { flags.StringVar(&cmd.tools_file, "tools_file", "tools.yaml", "File path specifying the tool configuration.") flags.Var(&cmd.cfg.LogLevel, "log-level", "Specify the minimum level logged. Allowed: 'DEBUG', 'INFO', 'WARN', 'ERROR'.") flags.Var(&cmd.cfg.LoggingFormat, "logging-format", "Specify logging format to use. Allowed: 'standard' or 'JSON'.") + flags.BoolVar(&cmd.cfg.TelemetryGCP, "telemetry-gcp", false, "Enable exporting directly to Google Cloud Monitoring.") + flags.StringVar(&cmd.cfg.TelemetryOTLP, "telemetry-otlp", "", "Enable exporting using OpenTelemetry Protocol (OTLP) to the specified endpoint (e.g. 'http://127.0.0.1:4318')") + flags.StringVar(&cmd.cfg.TelemetryServiceName, "telemetry-service-name", "toolbox", "Sets the value of the service.name resource attribute for telemetry data.") // wrap RunE command so that we have access to original Command object cmd.RunE = func(*cobra.Command, []string) error { return run(cmd) } @@ -173,6 +177,21 @@ func run(cmd *Command) error { return fmt.Errorf("logging format invalid.") } + // Set up OpenTelemetry + otelShutdown, err := telemetry.SetupOTel(ctx, cmd.Command.Version, cmd.cfg.TelemetryOTLP, cmd.cfg.TelemetryGCP, cmd.cfg.TelemetryServiceName) + if err != nil { + errMsg := fmt.Errorf("error setting up OpenTelemetry: %w", err) + cmd.logger.ErrorContext(ctx, errMsg.Error()) + return errMsg + } + defer func() { + err := otelShutdown(ctx) + if err != nil { + errMsg := fmt.Errorf("error shutting down OpenTelemetry: %w", err) + cmd.logger.ErrorContext(ctx, errMsg.Error()) + } + }() + // Read tool file contents buf, err := os.ReadFile(cmd.tools_file) if err != nil { diff --git a/cmd/root_test.go b/cmd/root_test.go index a3ab09d55..6a03bc9ce 100644 --- a/cmd/root_test.go +++ b/cmd/root_test.go @@ -41,6 +41,9 @@ func withDefaults(c server.ServerConfig) server.ServerConfig { if c.Port == 0 { c.Port = 5000 } + if c.TelemetryServiceName == "" { + c.TelemetryServiceName = "toolbox" + } return c } @@ -137,6 +140,27 @@ func TestServerConfigFlags(t *testing.T) { LogLevel: "WARN", }), }, + { + desc: "telemetry gcp", + args: []string{"--telemetry-gcp"}, + want: withDefaults(server.ServerConfig{ + TelemetryGCP: true, + }), + }, + { + desc: "telemetry otlp", + args: []string{"--telemetry-otlp", "http://127.0.0.1:4553"}, + want: withDefaults(server.ServerConfig{ + TelemetryOTLP: "http://127.0.0.1:4553", + }), + }, + { + desc: "telemetry service name", + args: []string{"--telemetry-service-name", "toolbox-custom"}, + want: withDefaults(server.ServerConfig{ + TelemetryServiceName: "toolbox-custom", + }), + }, } for _, tc := range tcs { t.Run(tc.desc, func(t *testing.T) { diff --git a/docs/telemetry/guide_collector.md b/docs/telemetry/guide_collector.md new file mode 100644 index 000000000..aeb00d75c --- /dev/null +++ b/docs/telemetry/guide_collector.md @@ -0,0 +1,96 @@ +# Use collector to export telemetry (trace and metric) data +Collector receives telemetry data, processes the telemetry, and exports it to a wide variety of observability backends using its components. + +## Collector +The OpenTelemetry Collector removes the need to run, operate, and maintain multiple +agents/collector. This works well with scalability and supports open source +observability data formats senidng to one or more open source or commercial +backends. In addition, collector also provide other benefits such as allowing +your service to offload data quickly while it take care of additional handling +like retries, batching, encryption, or even sensitive data filtering. + +To run a collector, you will have to provide a configuration file. The +configuration file consists of four classes of pipeline component that access +telemetry data. +- `Receivers` +- `Processors` +- `Exporters` +- `Connectors` + +Example of setting up the classes of pipeline components (in this example, we +don't use connectors): + +```yaml +receivers: + otlp: + protocols: + http: + endpoint: "127.0.0.1:4553" + +exporters: + googlecloud: + project: + +processors: + batch: + send_batch_size: 200 +``` + +After each pipeline component is configured, you will enable it within the +`service` section of the configuration file. + +```yaml +service: + pipelines: + traces: + receivers: ["otlp"] + processors: ["batch"] + exporters: ["googlecloud"] +``` + +For a conceptual overview of the Collector, see [Collector][collector]. + +[collector]: https://opentelemetry.io/docs/collector/ + +## Using a Collector +There are a couple of steps to run and use a Collector. + +1. Obtain a Collector binary. Pull a binary or Docker image for the + OpenTelemetry contrib collector. + +1. Set up credentials for telemetry backend. + +1. Set up the Collector config. + Below are some examples for setting up the Collector config: + - [Google Cloud Exporter][google-cloud-exporter] + - [Google Managed Service for Prometheus Exporter][google-prometheus-exporter] + +1. Run the Collector with the configuration file. + + ```bash + ./otelcol-contrib --config=collector-config.yaml + ``` + +1. Run toolbox with the `--telemetry-otlp` flag. Configure it to send them to + `http://127.0.0.1:4553` (for HTTP) or the Collector's URL. + + ```bash + ./toolbox --telemetry-otlp=http://127.0.0.1:4553 + ``` + +1. Once telemetry datas are collected, you can view them in your telemetry + backend. If you are using GCP exporters, telemetry will be visible in GCP + dashboard at [Metrics Explorer][metrics-explorer] and [Trace + Explorer][trace-explorer]. + +> [!NOTE] +> If you are exporting to Google Cloud monitoring, we recommend that you use +> the Google Cloud Exporter for traces and the Google Managed Service for +> Prometheus Exporter for metrics. + +[google-cloud-exporter]: + https://github.com/open-telemetry/opentelemetry-collector-contrib/tree/main/exporter/googlecloudexporter +[google-prometheus-exporter]: + https://github.com/open-telemetry/opentelemetry-collector-contrib/tree/main/exporter/googlemanagedprometheusexporter#example-configuration +[metrics-explorer]: https://console.cloud.google.com/monitoring/metrics-explorer +[trace-explorer]: https://console.cloud.google.com/traces diff --git a/docs/telemetry/telemetry.md b/docs/telemetry/telemetry.md new file mode 100644 index 000000000..97c55a865 --- /dev/null +++ b/docs/telemetry/telemetry.md @@ -0,0 +1,183 @@ +# Telemetry for Toolbox + +Telemetry data such as logs, metrics, and traces will help developers understand +the internal state of the system. + +Toolbox exports telemetry data of logs via standard out/err, and traces/metrics +through OpenTelemetry. Additional flags can be passed to Toolbox to enable +different logging behavior, or to export metrics through a specific +[exporter](#exporter). + + +## Logging + +### Logging format +Toolbox supports both text and structured logging format. + +The text logging (also the default logging format) outputs log as string: +``` +2024-11-12T15:08:11.451377-08:00 INFO "Initialized 0 sources.\n" +``` + +The structured logging outputs log as JSON: +``` +{ + "timestamp":"2024-11-04T16:45:11.987299-08:00", + "severity":"ERROR", + "logging.googleapis.com/sourceLocation":{...}, + "message":"unable to parse tool file at \"tools.yaml\": \"cloud-sql-postgres1\" is not a valid kind of data source" +} +``` +> [!NOTE] +> `logging.googleapis.com/sourceLocation` shows the source code location +> information associated with the log entry, if any. + +### Log level +Toolbox supports four log levels, including `Debug`, `Info`, `Warn`, +and `Error`. Toolbox will only output logs that are equal or more severe to the +level that it is set. Below are the log levels that Toolbox supports in the +order of severity. + +| **Log level** | **Description** | +|---------------|-----------------| +| Debug | Debug logs typically contain information that is only useful during the debugging phase and may be of little value during production. | +| Info | Info logs include information about successful operations within the application, such as a successful start, pause, or exit of the application. | +| Warn | Warning logs are slightly less severe than error conditions. While it does not cause an error, it indicates that an operation might fail in the future if action is not taken now. | +| Error | Error log is assigned to event logs that contain an application error message. | + +### Logging Configurations +The following flags can be used to customize Toolbox logging: + +| **Flag** | **Description** | +|----------|-----------------| +| `--log-level` | Preferred log level, allowed values: `debug`, `info`, `warn`, `error`. Default: `info`. | +| `--logging-format` | Preferred logging format, allowed values: `standard`, `json`. Default: `standard`. | + +#### Example: + +```bash +./toolbox --tools_file "tools.yaml" --log-level warn --logging-format json +``` + +## Telemetry +### Metrics +A metric is a measurement of a service captured at runtime. The collected data +can be used to provide important insights into the service. +Toolbox provides the following custom metrics: + +| **Metric Name** | **Description** | +|-----------------|-----------------| +| `toolbox.server.toolset.get.count` | Counts the number of toolset manifest requests served | +| `toolbox.server.tool.get.count` | Counts the number of tool manifest requests served | +| `toolbox.server.tool.get.invoke` | Counts the number of tool invocation requests served | + +All custom metrics have the following attributes/labels: + +| **Metric Attributes** | **Description** | +|-----------------|-----------------| +| `toolbox.name` | Name of the toolset or tool, if applicable. | +| `toolbox.status` | Operation status code, for example: `success`, `failure`. | + +### Traces +Trace is a tree of spans that shows the path that a request makes through an +application. + +Spans generated by Toolbox server is prefixed with `toolbox/server/`. For +example, when user run Toolbox, it will generate spans for the following, with +`toolbox/server/init` as the root span: + +![traces](traces.png) + +### Exporter +Exporter is responsible for processing and exporting telemetry data. Toolbox +generates telemetry data within the OpenTelemetry Protocol (OTLP), and user can +choose to use exporters that are designed to support the OpenTelemetry +Protocol. Within Toolbox, we provide two types of exporter implementation to +choose from, either the Google Cloud Exporter that will send data directly to +the backend, or the OTLP Exporter along with a Collector that will act as a +proxy to collect and export data to the telemetry backend of user's choice. + +![telemetry_flow](telemetry_flow.png) + +#### Google Cloud Exporter +The Google Cloud Exporter directly exports telemetry to Google Cloud Monitoring. +It utilizes the [GCP Metric Exporter][gcp-metric-exporter] and [GCP Trace +Exporter][gcp-trace-exporter]. + +[gcp-metric-exporter]: + https://github.com/GoogleCloudPlatform/opentelemetry-operations-go/tree/main/exporter/metric +[gcp-trace-exporter]: + https://github.com/GoogleCloudPlatform/opentelemetry-operations-go/tree/main/exporter/trace + +> [!NOTE] +> If you're using Google Cloud Monitoring, the following APIs will need to be +enabled. For instructions on how to enable APIs, see [this +guide](https://cloud.google.com/endpoints/docs/openapi/enable-api): +> +> - logging.googleapis.com +> - monitoring.googleapis.com +> - cloudtrace.googleapis.com + +#### OTLP Exporter +This implementation uses the default OTLP Exporter over HTTP for +[metrics][otlp-metric-exporter] and [traces][otlp-trace-exporter]. You can use +this exporter if you choose to export your telemetry data to a Collector. + +[otlp-metric-exporter]: https://opentelemetry.io/docs/languages/go/exporters/#otlp-traces-over-http +[otlp-trace-exporter]: https://opentelemetry.io/docs/languages/go/exporters/#otlp-traces-over-http + +### Collector +A collector acts as a proxy between the application and the telemetry backend. It +receives telemetry data, transforms it, and then exports data to backends that +can store it permanently. Toolbox provide an option to export telemetry data to user's choice of +backend(s) that are compatible with the Open Telemetry Protocol (OTLP). If you +would like to use a collector, please refer to this +[guide](./guide_collector.md). + +### Telemetry Configurations +The following flags are used to determine Toolbox's telemetry configuration: + +| **flag** | **type** | **description** | +|-------------------------------|----------|-----------------| +| `--telemetry-gcp` | bool | Enable exporting directly to Google Cloud Monitoring. Default is `false`. | +| `--telemetry-otlp` | string | Enable exporting using OpenTelemetry Protocol (OTLP) to the specified endpoint (e.g. 'http://127.0.0.1:4318'). | +| `--telemetry-service-name` | string | Sets the value of the `service.name` resource attribute. Default is `toolbox`. | + +In addition to the flags noted above, you can also make additional configuration +for OpenTelemetry via the [General SDK Configuration][sdk-configuration] through +environmental variables. + +[sdk-configuration]: + https://opentelemetry.io/docs/languages/sdk-configuration/general/ + +#### Example usage + +To enable Google Cloud Exporter: +```bash +./toolbox --telemetry-gcp +``` + +To enable OTLP Exporter, provide Collector endpoint: +```bash +./toolbox --telemetry-otlp=http://127.0.0.1:4553 +``` + +#### Resource Attribute +All metrics and traces generated within Toolbox will be associated with a +unified [resource][resource]. The list of resource attributes included are: + +| **Resource Name** | **Description** | +|-------------------|-----------------| +| [TelemetrySDK](https://pkg.go.dev/go.opentelemetry.io/otel/sdk/resource#WithTelemetrySDK) | TelemetrySDK version info. | +| [OS](https://pkg.go.dev/go.opentelemetry.io/otel/sdk/resource#WithOS) | OS attributes including OS description and OS type. | +| [Container](https://pkg.go.dev/go.opentelemetry.io/otel/sdk/resource#WithContainer) | Container attributes including container ID, if applicable. | +| [Host](https://pkg.go.dev/go.opentelemetry.io/otel/sdk/resource#WithHost) | Host attributes including host name. | +| [SchemaURL](https://pkg.go.dev/go.opentelemetry.io/otel/sdk/resource#WithSchemaURL) | Sets the schema URL for the configured resource. | +| `service.name` | Open telemetry service name. Defaulted to `toolbox`. User can set the service name via flag mentioned above to distinguish between different toolbox service. | +| `service.version` | The version of Toolbox used. | + + +[resource]: https://opentelemetry.io/docs/languages/go/resources/ + + + diff --git a/docs/telemetry/telemetry_flow.png b/docs/telemetry/telemetry_flow.png new file mode 100644 index 000000000..bfaf8a4f2 Binary files /dev/null and b/docs/telemetry/telemetry_flow.png differ diff --git a/docs/telemetry/traces.png b/docs/telemetry/traces.png new file mode 100644 index 000000000..4d970b30c Binary files /dev/null and b/docs/telemetry/traces.png differ diff --git a/go.mod b/go.mod index 7f1d94c0e..1bfe253ec 100644 --- a/go.mod +++ b/go.mod @@ -8,12 +8,21 @@ require ( cloud.google.com/go/alloydbconn v1.13.2 cloud.google.com/go/cloudsqlconn v1.13.2 cloud.google.com/go/spanner v1.73.0 + github.com/GoogleCloudPlatform/opentelemetry-operations-go/exporter/metric v0.49.0 + github.com/GoogleCloudPlatform/opentelemetry-operations-go/exporter/trace v1.25.0 github.com/go-chi/chi/v5 v5.1.0 github.com/go-chi/httplog/v2 v2.1.1 github.com/go-chi/render v1.0.3 github.com/google/go-cmp v0.6.0 github.com/jackc/pgx/v5 v5.7.1 github.com/spf13/cobra v1.8.1 + go.opentelemetry.io/contrib/propagators/autoprop v0.58.0 + go.opentelemetry.io/otel v1.33.0 + go.opentelemetry.io/otel/exporters/otlp/otlpmetric/otlpmetrichttp v1.33.0 + go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracehttp v1.33.0 + go.opentelemetry.io/otel/metric v1.33.0 + go.opentelemetry.io/otel/sdk v1.33.0 + go.opentelemetry.io/otel/sdk/metric v1.33.0 go.opentelemetry.io/otel/trace v1.33.0 google.golang.org/api v0.211.0 gopkg.in/yaml.v3 v3.0.1 @@ -28,9 +37,12 @@ require ( cloud.google.com/go/compute/metadata v0.5.2 // indirect cloud.google.com/go/longrunning v0.6.3 // indirect cloud.google.com/go/monitoring v1.22.0 // indirect + cloud.google.com/go/trace v1.11.2 // indirect github.com/GoogleCloudPlatform/grpc-gcp-go/grpcgcp v1.5.2 // indirect github.com/GoogleCloudPlatform/opentelemetry-operations-go/detectors/gcp v1.25.0 // indirect + github.com/GoogleCloudPlatform/opentelemetry-operations-go/internal/resourcemapping v0.49.0 // indirect github.com/ajg/form v1.5.1 // indirect + github.com/cenkalti/backoff/v4 v4.3.0 // indirect github.com/census-instrumentation/opencensus-proto v0.4.1 // indirect github.com/cespare/xxhash/v2 v2.3.0 // indirect github.com/cncf/xds/go v0.0.0-20240905190251-b4127c9b8d78 // indirect @@ -44,6 +56,7 @@ require ( github.com/google/uuid v1.6.0 // indirect github.com/googleapis/enterprise-certificate-proxy v0.3.4 // indirect github.com/googleapis/gax-go/v2 v2.14.0 // indirect + github.com/grpc-ecosystem/grpc-gateway/v2 v2.24.0 // indirect github.com/inconshreveable/mousetrap v1.1.0 // indirect github.com/jackc/pgpassfile v1.0.0 // indirect github.com/jackc/pgservicefile v0.0.0-20240606120523-5a60cdf6a761 // indirect @@ -55,10 +68,13 @@ require ( go.opentelemetry.io/contrib/detectors/gcp v1.33.0 // indirect go.opentelemetry.io/contrib/instrumentation/google.golang.org/grpc/otelgrpc v0.58.0 // indirect go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.58.0 // indirect - go.opentelemetry.io/otel v1.33.0 // indirect - go.opentelemetry.io/otel/metric v1.33.0 // indirect - go.opentelemetry.io/otel/sdk v1.33.0 // indirect - go.opentelemetry.io/otel/sdk/metric v1.33.0 // indirect + go.opentelemetry.io/contrib/propagators/aws v1.33.0 // indirect + go.opentelemetry.io/contrib/propagators/b3 v1.33.0 // indirect + go.opentelemetry.io/contrib/propagators/jaeger v1.33.0 // indirect + go.opentelemetry.io/contrib/propagators/ot v1.33.0 // indirect + go.opentelemetry.io/otel/exporters/otlp/otlptrace v1.33.0 // indirect + go.opentelemetry.io/proto/otlp v1.4.0 // indirect + go.uber.org/multierr v1.11.0 // indirect golang.org/x/crypto v0.31.0 // indirect golang.org/x/net v0.32.0 // indirect golang.org/x/oauth2 v0.24.0 // indirect diff --git a/go.sum b/go.sum index 981fda3e1..3718e3336 100644 --- a/go.sum +++ b/go.sum @@ -356,6 +356,8 @@ cloud.google.com/go/lifesciences v0.6.0/go.mod h1:ddj6tSX/7BOnhxCSd3ZcETvtNr8NZ6 cloud.google.com/go/lifesciences v0.8.0/go.mod h1:lFxiEOMqII6XggGbOnKiyZ7IBwoIqA84ClvoezaA/bo= cloud.google.com/go/logging v1.6.1/go.mod h1:5ZO0mHHbvm8gEmeEUHrmDlTDSu5imF6MUP9OfilNXBw= cloud.google.com/go/logging v1.7.0/go.mod h1:3xjP2CjkM3ZkO73aj4ASA5wRPGGCRrPIAeNqVNkzY8M= +cloud.google.com/go/logging v1.12.0 h1:ex1igYcGFd4S/RZWOCU51StlIEuey5bjqwH9ZYjHibk= +cloud.google.com/go/logging v1.12.0/go.mod h1:wwYBt5HlYP1InnrtYI0wtwttpVU1rifnMT7RejksUAM= cloud.google.com/go/longrunning v0.1.1/go.mod h1:UUFxuDWkv22EuY93jjmDMFT5GPQKeFVJBIF6QlTqdsE= cloud.google.com/go/longrunning v0.3.0/go.mod h1:qth9Y41RRSUE69rDcOn6DdK3HfQfsUI0YSmW3iIlLJc= cloud.google.com/go/longrunning v0.4.1/go.mod h1:4iWDqhBZ70CvZ6BfETbvam3T8FMvLK+eFj0E6AaRQTo= @@ -570,6 +572,8 @@ cloud.google.com/go/trace v1.3.0/go.mod h1:FFUE83d9Ca57C+K8rDl/Ih8LwOzWIV1krKgxg cloud.google.com/go/trace v1.4.0/go.mod h1:UG0v8UBqzusp+z63o7FK74SdFE+AXpCLdFb1rshXG+Y= cloud.google.com/go/trace v1.8.0/go.mod h1:zH7vcsbAhklH8hWFig58HvxcxyQbaIqMarMg9hn5ECA= cloud.google.com/go/trace v1.9.0/go.mod h1:lOQqpE5IaWY0Ixg7/r2SjixMuc6lfTFeO4QGM4dQWOk= +cloud.google.com/go/trace v1.11.2 h1:4ZmaBdL8Ng/ajrgKqY5jfvzqMXbrDcBsUGXOT9aqTtI= +cloud.google.com/go/trace v1.11.2/go.mod h1:bn7OwXd4pd5rFuAnTrzBuoZ4ax2XQeG3qNgYmfCy0Io= cloud.google.com/go/translate v1.3.0/go.mod h1:gzMUwRjvOqj5i69y/LYLd8RrNQk+hOmIXTi9+nb3Djs= cloud.google.com/go/translate v1.4.0/go.mod h1:06Dn/ppvLD6WvA5Rhdp029IX2Mi3Mn7fpMRLPvXT5Wg= cloud.google.com/go/translate v1.5.0/go.mod h1:29YDSYveqqpA1CQFD7NQuP49xymq17RXNaUDdc0mNu0= @@ -627,6 +631,14 @@ github.com/GoogleCloudPlatform/grpc-gcp-go/grpcgcp v1.5.2 h1:DBjmt6/otSdULyJdVg2 github.com/GoogleCloudPlatform/grpc-gcp-go/grpcgcp v1.5.2/go.mod h1:dppbR7CwXD4pgtV9t3wD1812RaLDcBjtblcDF5f1vI0= github.com/GoogleCloudPlatform/opentelemetry-operations-go/detectors/gcp v1.25.0 h1:3c8yed4lgqTt+oTQ+JNMDo+F4xprBf+O/il4ZC0nRLw= github.com/GoogleCloudPlatform/opentelemetry-operations-go/detectors/gcp v1.25.0/go.mod h1:obipzmGjfSjam60XLwGfqUkJsfiheAl+TUjG+4yzyPM= +github.com/GoogleCloudPlatform/opentelemetry-operations-go/exporter/metric v0.49.0 h1:o90wcURuxekmXrtxmYWTyNla0+ZEHhud6DI1ZTxd1vI= +github.com/GoogleCloudPlatform/opentelemetry-operations-go/exporter/metric v0.49.0/go.mod h1:6fTWu4m3jocfUZLYF5KsZC1TUfRvEjs7lM4crme/irw= +github.com/GoogleCloudPlatform/opentelemetry-operations-go/exporter/trace v1.25.0 h1:4PoDbd/9/06IpwLGxSfvfNoEr9urvfkrN6mmJangGCg= +github.com/GoogleCloudPlatform/opentelemetry-operations-go/exporter/trace v1.25.0/go.mod h1:EycllQ1gupHbjqbcmfCr/H6FKSGSmEUONJ2ivb86qeY= +github.com/GoogleCloudPlatform/opentelemetry-operations-go/internal/cloudmock v0.49.0 h1:jJKWl98inONJAr/IZrdFQUWcwUO95DLY1XMD1ZIut+g= +github.com/GoogleCloudPlatform/opentelemetry-operations-go/internal/cloudmock v0.49.0/go.mod h1:l2fIqmwB+FKSfvn3bAD/0i+AXAxhIZjTK2svT/mgUXs= +github.com/GoogleCloudPlatform/opentelemetry-operations-go/internal/resourcemapping v0.49.0 h1:GYUJLfvd++4DMuMhCFLgLXvFwofIxh/qOwoGuS/LTew= +github.com/GoogleCloudPlatform/opentelemetry-operations-go/internal/resourcemapping v0.49.0/go.mod h1:wRbFgBQUVm1YXrvWKofAEmq9HNJTDphbAaJSSX01KUI= github.com/JohnCGriffin/overflow v0.0.0-20211019200055-46fa312c352c/go.mod h1:X0CRv0ky0k6m906ixxpzmDRLvX58TFUKS2eePweuyxk= github.com/OneOfOne/xxhash v1.2.2/go.mod h1:HSdplMjZKSmBqAxg5vPj2TmRDmfkzw+cTzAElWljhcU= github.com/ajg/form v1.5.1 h1:t9c7v8JUKu/XxOGBU0yjNpaMloxGEJhUkqFRq0ibGeU= @@ -642,6 +654,8 @@ github.com/apache/arrow/go/v11 v11.0.0/go.mod h1:Eg5OsL5H+e299f7u5ssuXsuHQVEGC4x github.com/apache/thrift v0.16.0/go.mod h1:PHK3hniurgQaNMZYaCLEqXKsYK8upmhPbmdP2FXSqgU= github.com/boombuler/barcode v1.0.0/go.mod h1:paBWMcWSl3LHKBqUq+rly7CNSldXjb2rDl3JlRe0mD8= github.com/boombuler/barcode v1.0.1/go.mod h1:paBWMcWSl3LHKBqUq+rly7CNSldXjb2rDl3JlRe0mD8= +github.com/cenkalti/backoff/v4 v4.3.0 h1:MyRJ/UdXutAwSAT+s3wNd7MfTIcy71VQueUuFK343L8= +github.com/cenkalti/backoff/v4 v4.3.0/go.mod h1:Y3VNntkOUPxTVeUxJ/G5vcM//AlwfmyYozVcomhLiZE= github.com/census-instrumentation/opencensus-proto v0.2.1/go.mod h1:f6KPmirojxKA12rnyqOA5BBL4O983OfeGPqjHWSTneU= github.com/census-instrumentation/opencensus-proto v0.3.0/go.mod h1:f6KPmirojxKA12rnyqOA5BBL4O983OfeGPqjHWSTneU= github.com/census-instrumentation/opencensus-proto v0.4.1 h1:iKLQ0xPNFxR/2hzXZMrBo8f1j86j5WHzznCCQxV/b8g= @@ -844,6 +858,8 @@ github.com/googleapis/google-cloud-go-testing v0.0.0-20200911160855-bcd43fbb19e8 github.com/grpc-ecosystem/grpc-gateway v1.16.0/go.mod h1:BDjrQk3hbvj6Nolgz8mAMFbcEtjT1g+wF4CSlocrBnw= github.com/grpc-ecosystem/grpc-gateway/v2 v2.7.0/go.mod h1:hgWBS7lorOAVIJEQMi4ZsPv9hVvWI6+ch50m39Pf2Ks= github.com/grpc-ecosystem/grpc-gateway/v2 v2.11.3/go.mod h1:o//XUCC/F+yRGJoPO/VU0GSB0f8Nhgmxx0VIRUvaC0w= +github.com/grpc-ecosystem/grpc-gateway/v2 v2.24.0 h1:TmHmbvxPmaegwhDubVz0lICL0J5Ka2vwTzhoePEXsGE= +github.com/grpc-ecosystem/grpc-gateway/v2 v2.24.0/go.mod h1:qztMSjm835F2bXf+5HKAPIS5qsmQDqZna/PgVt4rWtI= github.com/hashicorp/golang-lru v0.5.0/go.mod h1:/m3WP610KZHVQ1SGc6re/UDhFvYD7pJ4Ao+sR/qLZy8= github.com/hashicorp/golang-lru v0.5.1/go.mod h1:/m3WP610KZHVQ1SGc6re/UDhFvYD7pJ4Ao+sR/qLZy8= github.com/iancoleman/strcase v0.2.0/go.mod h1:iwCmte+B7n89clKwxIoIXy/HfoL7AsD47ZCWhYzw7ho= @@ -977,8 +993,24 @@ go.opentelemetry.io/contrib/instrumentation/google.golang.org/grpc/otelgrpc v0.5 go.opentelemetry.io/contrib/instrumentation/google.golang.org/grpc/otelgrpc v0.58.0/go.mod h1:HDBUsEjOuRC0EzKZ1bSaRGZWUBAzo+MhAcUUORSr4D0= go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.58.0 h1:yd02MEjBdJkG3uabWP9apV+OuWRIXGDuJEUJbOHmCFU= go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.58.0/go.mod h1:umTcuxiv1n/s/S6/c2AT/g2CQ7u5C59sHDNmfSwgz7Q= +go.opentelemetry.io/contrib/propagators/autoprop v0.58.0 h1:pL1MMoBcG/ol6fVsjE1bbOO9A8GMQiN+T73hnmaXDoU= +go.opentelemetry.io/contrib/propagators/autoprop v0.58.0/go.mod h1:EU5uMoCqafsagp4hzFqzu1Eyg/8L23JS5Y1hChoHf7s= +go.opentelemetry.io/contrib/propagators/aws v1.33.0 h1:MefPfPIut0IxEiQRK1qVv5AFADBOwizl189+m7QhpFg= +go.opentelemetry.io/contrib/propagators/aws v1.33.0/go.mod h1:VB6xPo12uW/PezOqtA/cY2/DiAGYshnhID606wC9NEY= +go.opentelemetry.io/contrib/propagators/b3 v1.33.0 h1:ig/IsHyyoQ1F1d6FUDIIW5oYpsuTVtN16AyGOgdjAHQ= +go.opentelemetry.io/contrib/propagators/b3 v1.33.0/go.mod h1:EsVYoNy+Eol5znb6wwN3XQTILyjl040gUpEnUSNZfsk= +go.opentelemetry.io/contrib/propagators/jaeger v1.33.0 h1:Jok/dG8kfp+yod29XKYV/blWgYPlMuRUoRHljrXMF5E= +go.opentelemetry.io/contrib/propagators/jaeger v1.33.0/go.mod h1:ku/EpGk44S5lyVMbtJRK2KFOnXEehxf6SDnhu1eZmjA= +go.opentelemetry.io/contrib/propagators/ot v1.33.0 h1:xj/pQFKo4ROsx0v129KpLgFwaYMgFTu3dAMEEih97cY= +go.opentelemetry.io/contrib/propagators/ot v1.33.0/go.mod h1:/xxHCLhTmaypEFwMViRGROj2qgrGiFrkxIlATt0rddc= go.opentelemetry.io/otel v1.33.0 h1:/FerN9bax5LoK51X/sI0SVYrjSE0/yUL7DpxW4K3FWw= go.opentelemetry.io/otel v1.33.0/go.mod h1:SUUkR6csvUQl+yjReHu5uM3EtVV7MBm5FHKRlNx4I8I= +go.opentelemetry.io/otel/exporters/otlp/otlpmetric/otlpmetrichttp v1.33.0 h1:bSjzTvsXZbLSWU8hnZXcKmEVaJjjnandxD0PxThhVU8= +go.opentelemetry.io/otel/exporters/otlp/otlpmetric/otlpmetrichttp v1.33.0/go.mod h1:aj2rilHL8WjXY1I5V+ra+z8FELtk681deydgYT8ikxU= +go.opentelemetry.io/otel/exporters/otlp/otlptrace v1.33.0 h1:Vh5HayB/0HHfOQA7Ctx69E/Y/DcQSMPpKANYVMQ7fBA= +go.opentelemetry.io/otel/exporters/otlp/otlptrace v1.33.0/go.mod h1:cpgtDBaqD/6ok/UG0jT15/uKjAY8mRA53diogHBg3UI= +go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracehttp v1.33.0 h1:wpMfgF8E1rkrT1Z6meFh1NDtownE9Ii3n3X2GJYjsaU= +go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracehttp v1.33.0/go.mod h1:wAy0T/dUbs468uOlkT31xjvqQgEVXv58BRFWEgn5v/0= go.opentelemetry.io/otel/metric v1.33.0 h1:r+JOocAyeRVXD8lZpjdQjzMadVZp2M4WmQ+5WtEnklQ= go.opentelemetry.io/otel/metric v1.33.0/go.mod h1:L9+Fyctbp6HFTddIxClbQkjtubW6O9QS3Ann/M82u6M= go.opentelemetry.io/otel/sdk v1.33.0 h1:iax7M131HuAm9QkZotNHEfstof92xM+N8sr3uHXc2IM= @@ -990,6 +1022,10 @@ go.opentelemetry.io/otel/trace v1.33.0/go.mod h1:uIcdVUZMpTAmz0tI1z04GoVSezK37Cb go.opentelemetry.io/proto/otlp v0.7.0/go.mod h1:PqfVotwruBrMGOCsRd/89rSnXhoiJIqeYNgFYFoEGnI= go.opentelemetry.io/proto/otlp v0.15.0/go.mod h1:H7XAot3MsfNsj7EXtrA2q5xSNQ10UqI405h3+duxN4U= go.opentelemetry.io/proto/otlp v0.19.0/go.mod h1:H7XAot3MsfNsj7EXtrA2q5xSNQ10UqI405h3+duxN4U= +go.opentelemetry.io/proto/otlp v1.4.0 h1:TA9WRvW6zMwP+Ssb6fLoUIuirti1gGbP28GcKG1jgeg= +go.opentelemetry.io/proto/otlp v1.4.0/go.mod h1:PPBWZIP98o2ElSqI35IHfu7hIhSwvc5N38Jw8pXuGFY= +go.uber.org/multierr v1.11.0 h1:blXXJkSxSSfBVBlC76pxqeO+LN3aDfLQo+309xJstO0= +go.uber.org/multierr v1.11.0/go.mod h1:20+QtiLqy0Nd6FdQB9TLXag12DsQkrbs3htMFfDN80Y= golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w= golang.org/x/crypto v0.0.0-20190510104115-cbcb75029529/go.mod h1:yigFU9vqHzYiE8UmvKecakEJjdnWj3jj499lnFckfCI= golang.org/x/crypto v0.0.0-20190605123033-f99c8df09eb5/go.mod h1:yigFU9vqHzYiE8UmvKecakEJjdnWj3jj499lnFckfCI= diff --git a/internal/server/api.go b/internal/server/api.go index bb1b63714..f24d25fe4 100644 --- a/internal/server/api.go +++ b/internal/server/api.go @@ -25,6 +25,9 @@ import ( "github.com/go-chi/chi/v5/middleware" "github.com/go-chi/render" "github.com/googleapis/genai-toolbox/internal/tools" + "go.opentelemetry.io/otel/attribute" + "go.opentelemetry.io/otel/codes" + "go.opentelemetry.io/otel/metric" ) // apiRouter creates a router that represents the routes under /api @@ -48,10 +51,33 @@ func apiRouter(s *Server) (chi.Router, error) { // toolsetHandler handles the request for information about a Toolset. func toolsetHandler(s *Server, w http.ResponseWriter, r *http.Request) { + ctx, span := s.instrumentation.Tracer.Start(r.Context(), "toolbox/server/toolset/get") + r = r.WithContext(ctx) + toolsetName := chi.URLParam(r, "toolsetName") + span.SetAttributes(attribute.String("toolset_name", toolsetName)) + var err error + defer func() { + if err != nil { + span.SetStatus(codes.Error, err.Error()) + } + span.End() + + status := "success" + if err != nil { + status = "error" + } + s.instrumentation.ToolsetGet.Add( + r.Context(), + 1, + metric.WithAttributes(attribute.String("toolbox.name", toolsetName)), + metric.WithAttributes(attribute.String("toolbox.operation.status", status)), + ) + }() + toolset, ok := s.toolsets[toolsetName] if !ok { - err := fmt.Errorf("Toolset %q does not exist", toolsetName) + err = fmt.Errorf("Toolset %q does not exist", toolsetName) s.logger.DebugContext(context.Background(), err.Error()) _ = render.Render(w, r, newErrResponse(err, http.StatusNotFound)) return @@ -61,10 +87,32 @@ func toolsetHandler(s *Server, w http.ResponseWriter, r *http.Request) { // toolGetHandler handles requests for a single Tool. func toolGetHandler(s *Server, w http.ResponseWriter, r *http.Request) { + ctx, span := s.instrumentation.Tracer.Start(r.Context(), "toolbox/server/tool/get") + r = r.WithContext(ctx) + toolName := chi.URLParam(r, "toolName") + span.SetAttributes(attribute.String("tool_name", toolName)) + var err error + defer func() { + if err != nil { + span.SetStatus(codes.Error, err.Error()) + } + span.End() + + status := "success" + if err != nil { + status = "error" + } + s.instrumentation.ToolGet.Add( + r.Context(), + 1, + metric.WithAttributes(attribute.String("toolbox.name", toolName)), + metric.WithAttributes(attribute.String("toolbox.operation.status", status)), + ) + }() tool, ok := s.tools[toolName] if !ok { - err := fmt.Errorf("invalid tool name: tool with name %q does not exist", toolName) + err = fmt.Errorf("invalid tool name: tool with name %q does not exist", toolName) s.logger.DebugContext(context.Background(), err.Error()) _ = render.Render(w, r, newErrResponse(err, http.StatusNotFound)) return @@ -82,10 +130,33 @@ func toolGetHandler(s *Server, w http.ResponseWriter, r *http.Request) { // toolInvokeHandler handles the API request to invoke a specific Tool. func toolInvokeHandler(s *Server, w http.ResponseWriter, r *http.Request) { + ctx, span := s.instrumentation.Tracer.Start(r.Context(), "toolbox/server/tool/invoke") + r = r.WithContext(ctx) + toolName := chi.URLParam(r, "toolName") + span.SetAttributes(attribute.String("tool_name", toolName)) + var err error + defer func() { + if err != nil { + span.SetStatus(codes.Error, err.Error()) + } + span.End() + + status := "success" + if err != nil { + status = "error" + } + s.instrumentation.ToolInvoke.Add( + r.Context(), + 1, + metric.WithAttributes(attribute.String("toolbox.name", toolName)), + metric.WithAttributes(attribute.String("toolbox.operation.status", status)), + ) + }() + tool, ok := s.tools[toolName] if !ok { - err := fmt.Errorf("invalid tool name: tool with name %q does not exist", toolName) + err = fmt.Errorf("invalid tool name: tool with name %q does not exist", toolName) s.logger.DebugContext(context.Background(), err.Error()) _ = render.Render(w, r, newErrResponse(err, http.StatusNotFound)) return @@ -97,7 +168,7 @@ func toolInvokeHandler(s *Server, w http.ResponseWriter, r *http.Request) { for _, aS := range s.authSources { claims, err := aS.GetClaimsFromHeader(r.Header) if err != nil { - err := fmt.Errorf("failure getting claims from header: %w", err) + err = fmt.Errorf("failure getting claims from header: %w", err) s.logger.DebugContext(context.Background(), err.Error()) _ = render.Render(w, r, newErrResponse(err, http.StatusBadRequest)) return @@ -119,16 +190,16 @@ func toolInvokeHandler(s *Server, w http.ResponseWriter, r *http.Request) { // Check if any of the specified auth sources is verified isAuthorized := tool.Authorized(verifiedAuthSources) if !isAuthorized { - err := fmt.Errorf("tool invocation not authorized. Please make sure your specify correct auth headers") + err = fmt.Errorf("tool invocation not authorized. Please make sure your specify correct auth headers") s.logger.DebugContext(context.Background(), err.Error()) _ = render.Render(w, r, newErrResponse(err, http.StatusUnauthorized)) return } var data map[string]any - if err := decodeJSON(r.Body, &data); err != nil { + if err = decodeJSON(r.Body, &data); err != nil { render.Status(r, http.StatusBadRequest) - err := fmt.Errorf("request body was invalid JSON: %w", err) + err = fmt.Errorf("request body was invalid JSON: %w", err) s.logger.DebugContext(context.Background(), err.Error()) _ = render.Render(w, r, newErrResponse(err, http.StatusBadRequest)) return @@ -136,7 +207,7 @@ func toolInvokeHandler(s *Server, w http.ResponseWriter, r *http.Request) { params, err := tool.ParseParams(data, claimsFromAuth) if err != nil { - err := fmt.Errorf("provided parameters were invalid: %w", err) + err = fmt.Errorf("provided parameters were invalid: %w", err) s.logger.DebugContext(context.Background(), err.Error()) _ = render.Render(w, r, newErrResponse(err, http.StatusBadRequest)) return @@ -144,7 +215,7 @@ func toolInvokeHandler(s *Server, w http.ResponseWriter, r *http.Request) { res, err := tool.Invoke(params) if err != nil { - err := fmt.Errorf("error while invoking tool: %w", err) + err = fmt.Errorf("error while invoking tool: %w", err) s.logger.DebugContext(context.Background(), err.Error()) _ = render.Render(w, r, newErrResponse(err, http.StatusInternalServerError)) return diff --git a/internal/server/api_test.go b/internal/server/api_test.go index 62eb22100..1a52b6faf 100644 --- a/internal/server/api_test.go +++ b/internal/server/api_test.go @@ -15,6 +15,7 @@ package server import ( + "context" "encoding/json" "fmt" "io" @@ -24,11 +25,14 @@ import ( "testing" "github.com/googleapis/genai-toolbox/internal/log" + "github.com/googleapis/genai-toolbox/internal/telemetry" "github.com/googleapis/genai-toolbox/internal/tools" ) var _ tools.Tool = &MockTool{} +const fakeVersionString = "0.0.0" + type MockTool struct { Name string Description string @@ -57,6 +61,9 @@ func (t MockTool) Authorized(verifiedAuthSources []string) bool { } func TestToolsetEndpoint(t *testing.T) { + ctx, cancel := context.WithCancel(context.Background()) + defer cancel() + // Set up resources to test against tool1 := MockTool{ Name: "no_params", @@ -78,7 +85,7 @@ func TestToolsetEndpoint(t *testing.T) { "tool2_only": {tool2.Name}, } { tc := tools.ToolsetConfig{Name: name, ToolNames: l} - m, err := tc.Initialize("0.0.0", toolsMap) + m, err := tc.Initialize(fakeVersionString, toolsMap) if err != nil { t.Fatalf("unable to initialize toolset %q: %s", name, err) } @@ -87,9 +94,25 @@ func TestToolsetEndpoint(t *testing.T) { testLogger, err := log.NewStdLogger(os.Stdout, os.Stderr, "info") if err != nil { - t.Fatalf("unexpected error: %s", err) + t.Fatalf("unable to initialize logger: %s", err) + } + + otelShutdown, err := telemetry.SetupOTel(ctx, fakeVersionString, "", false, "toolbox") + if err != nil { + t.Fatalf("unable to setup otel: %s", err) } - server := Server{logger: testLogger, tools: toolsMap, toolsets: toolsets} + defer func() { + err := otelShutdown(ctx) + if err != nil { + t.Fatalf("error shutting down OpenTelemetry: %s", err) + } + }() + instrumentation, err := CreateTelemetryInstrumentation(fakeVersionString) + if err != nil { + t.Fatalf("unable to create custom metrics: %s", err) + } + + server := Server{logger: testLogger, instrumentation: instrumentation, tools: toolsMap, toolsets: toolsets} r, err := apiRouter(&server) if err != nil { t.Fatalf("unable to initialize router: %s", err) @@ -115,7 +138,7 @@ func TestToolsetEndpoint(t *testing.T) { toolsetName: "", want: wantResponse{ statusCode: http.StatusOK, - version: "0.0.0", + version: fakeVersionString, tools: []string{tool1.Name, tool2.Name}, }, }, @@ -132,7 +155,7 @@ func TestToolsetEndpoint(t *testing.T) { toolsetName: "tool1_only", want: wantResponse{ statusCode: http.StatusOK, - version: "0.0.0", + version: fakeVersionString, tools: []string{tool1.Name}, }, }, @@ -141,7 +164,7 @@ func TestToolsetEndpoint(t *testing.T) { toolsetName: "tool2_only", want: wantResponse{ statusCode: http.StatusOK, - version: "0.0.0", + version: fakeVersionString, tools: []string{tool2.Name}, }, }, @@ -186,6 +209,9 @@ func TestToolsetEndpoint(t *testing.T) { } } func TestToolGetEndpoint(t *testing.T) { + ctx, cancel := context.WithCancel(context.Background()) + defer cancel() + // Set up resources to test against tool1 := MockTool{ Name: "no_params", @@ -202,9 +228,25 @@ func TestToolGetEndpoint(t *testing.T) { testLogger, err := log.NewStdLogger(os.Stdout, os.Stderr, "info") if err != nil { - t.Fatalf("unexpected error: %s", err) + t.Fatalf("unable to initialize logger: %s", err) } - server := Server{version: "0.0.0", logger: testLogger, tools: toolsMap} + + otelShutdown, err := telemetry.SetupOTel(ctx, fakeVersionString, "", false, "toolbox") + if err != nil { + t.Fatalf("unable to setup otel: %s", err) + } + defer func() { + err := otelShutdown(ctx) + if err != nil { + t.Fatalf("error shutting down OpenTelemetry: %s", err) + } + }() + instrumentation, err := CreateTelemetryInstrumentation(fakeVersionString) + if err != nil { + t.Fatalf("unable to create custom metrics: %s", err) + } + + server := Server{version: fakeVersionString, logger: testLogger, instrumentation: instrumentation, tools: toolsMap} r, err := apiRouter(&server) if err != nil { t.Fatalf("unable to initialize router: %s", err) @@ -230,7 +272,7 @@ func TestToolGetEndpoint(t *testing.T) { toolName: tool1.Name, want: wantResponse{ statusCode: http.StatusOK, - version: "0.0.0", + version: fakeVersionString, tools: []string{tool1.Name}, }, }, @@ -239,7 +281,7 @@ func TestToolGetEndpoint(t *testing.T) { toolName: tool2.Name, want: wantResponse{ statusCode: http.StatusOK, - version: "0.0.0", + version: fakeVersionString, tools: []string{tool2.Name}, }, }, diff --git a/internal/server/config.go b/internal/server/config.go index 64bf8f948..83e46e46f 100644 --- a/internal/server/config.go +++ b/internal/server/config.go @@ -47,8 +47,14 @@ type ServerConfig struct { ToolsetConfigs ToolsetConfigs // LoggingFormat defines whether structured loggings are used. LoggingFormat logFormat - // LogLevel defines the levels to log + // LogLevel defines the levels to log. LogLevel StringLevel + // TelemetryGCP defines whether GCP exporter is used. + TelemetryGCP bool + // TelemetryOTLP defines OTLP collector url for telemetry exports. + TelemetryOTLP string + // TelemetryServiceName defines the value of service.name resource attribute. + TelemetryServiceName string } type logFormat string diff --git a/internal/server/instrumentation.go b/internal/server/instrumentation.go new file mode 100644 index 000000000..3974aa311 --- /dev/null +++ b/internal/server/instrumentation.go @@ -0,0 +1,85 @@ +// Copyright 2024 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package server + +import ( + "fmt" + + "go.opentelemetry.io/otel" + "go.opentelemetry.io/otel/metric" + "go.opentelemetry.io/otel/trace" +) + +const ( + TracerName = "github.com/googleapis/genai-toolbox/internal/opentel" + MetricName = "github.com/googleapis/genai-toolbox/internal/opentel" + + toolsetGetCountName = "toolbox.server.toolset.get.count" + toolGetCountName = "toolbox.server.tool.get.count" + toolInvokeCountName = "toolbox.server.tool.invoke.count" +) + +// Instrumentation defines the telemetry instrumentation for toolbox +type Instrumentation struct { + Tracer trace.Tracer + meter metric.Meter + ToolsetGet metric.Int64Counter + ToolGet metric.Int64Counter + ToolInvoke metric.Int64Counter +} + +func CreateTelemetryInstrumentation(versionString string) (*Instrumentation, error) { + tracer := otel.Tracer( + TracerName, + trace.WithInstrumentationVersion(versionString), + ) + + meter := otel.Meter(MetricName, metric.WithInstrumentationVersion(versionString)) + toolsetGet, err := meter.Int64Counter( + toolsetGetCountName, + metric.WithDescription("Number of toolset GET API calls."), + metric.WithUnit("{call}"), + ) + if err != nil { + return nil, fmt.Errorf("unable to create %s metric: %w", toolsetGetCountName, err) + } + + toolGet, err := meter.Int64Counter( + toolGetCountName, + metric.WithDescription("Number of tool GET API calls."), + metric.WithUnit("{call}"), + ) + if err != nil { + return nil, fmt.Errorf("unable to create %s metric: %w", toolGetCountName, err) + } + + toolInvoke, err := meter.Int64Counter( + toolInvokeCountName, + metric.WithDescription("Number of tool Invoke API calls."), + metric.WithUnit("{call}"), + ) + if err != nil { + return nil, fmt.Errorf("unable to create %s metric: %w", toolInvokeCountName, err) + } + + instrumentation := &Instrumentation{ + Tracer: tracer, + meter: meter, + ToolsetGet: toolsetGet, + ToolGet: toolGet, + ToolInvoke: toolInvoke, + } + return instrumentation, nil +} diff --git a/internal/server/server.go b/internal/server/server.go index 398c8c88b..f0a4b6021 100644 --- a/internal/server/server.go +++ b/internal/server/server.go @@ -29,15 +29,18 @@ import ( "github.com/googleapis/genai-toolbox/internal/log" "github.com/googleapis/genai-toolbox/internal/sources" "github.com/googleapis/genai-toolbox/internal/tools" + "go.opentelemetry.io/otel/attribute" + "go.opentelemetry.io/otel/trace" ) // Server contains info for running an instance of Toolbox. Should be instantiated with NewServer(). type Server struct { - version string - srv *http.Server - listener net.Listener - root chi.Router - logger log.Logger + version string + srv *http.Server + listener net.Listener + root chi.Router + logger log.Logger + instrumentation *Instrumentation sources map[string]sources.Source authSources map[string]auth.AuthSource @@ -47,6 +50,14 @@ type Server struct { // NewServer returns a Server object based on provided Config. func NewServer(ctx context.Context, cfg ServerConfig, l log.Logger) (*Server, error) { + instrumentation, err := CreateTelemetryInstrumentation(cfg.Version) + if err != nil { + return nil, fmt.Errorf("unable to create telemetry instrumentation: %w", err) + } + + parentCtx, span := instrumentation.Tracer.Start(context.Background(), "toolbox/server/init") + defer span.End() + // set up http serving r := chi.NewRouter() r.Use(middleware.Recoverer) @@ -84,9 +95,22 @@ func NewServer(ctx context.Context, cfg ServerConfig, l log.Logger) (*Server, er // initialize and validate the sources from configs sourcesMap := make(map[string]sources.Source) for name, sc := range cfg.SourceConfigs { - s, err := sc.Initialize() + s, err := func() (sources.Source, error) { + ctx, span := instrumentation.Tracer.Start( + parentCtx, + "toolbox/server/source/init", + trace.WithAttributes(attribute.String("source_kind", sc.SourceConfigKind())), + trace.WithAttributes(attribute.String("source_name", name)), + ) + defer span.End() + s, err := sc.Initialize(ctx, instrumentation.Tracer) + if err != nil { + return nil, fmt.Errorf("unable to initialize source %q: %w", name, err) + } + return s, nil + }() if err != nil { - return nil, fmt.Errorf("unable to initialize source %q: %w", name, err) + return nil, err } sourcesMap[name] = s } @@ -95,9 +119,22 @@ func NewServer(ctx context.Context, cfg ServerConfig, l log.Logger) (*Server, er // initialize and validate the auth sources from configs authSourcesMap := make(map[string]auth.AuthSource) for name, sc := range cfg.AuthSourceConfigs { - a, err := sc.Initialize() + a, err := func() (auth.AuthSource, error) { + _, span := instrumentation.Tracer.Start( + parentCtx, + "toolbox/server/auth/init", + trace.WithAttributes(attribute.String("auth_kind", sc.AuthSourceConfigKind())), + trace.WithAttributes(attribute.String("auth_name", name)), + ) + defer span.End() + a, err := sc.Initialize() + if err != nil { + return nil, fmt.Errorf("unable to initialize auth source %q: %w", name, err) + } + return a, nil + }() if err != nil { - return nil, fmt.Errorf("unable to initialize auth source %q: %w", name, err) + return nil, err } authSourcesMap[name] = a } @@ -106,9 +143,22 @@ func NewServer(ctx context.Context, cfg ServerConfig, l log.Logger) (*Server, er // initialize and validate the tools from configs toolsMap := make(map[string]tools.Tool) for name, tc := range cfg.ToolConfigs { - t, err := tc.Initialize(sourcesMap) + t, err := func() (tools.Tool, error) { + _, span := instrumentation.Tracer.Start( + parentCtx, + "toolbox/server/tool/init", + trace.WithAttributes(attribute.String("tool_kind", tc.ToolConfigKind())), + trace.WithAttributes(attribute.String("tool_name", name)), + ) + defer span.End() + t, err := tc.Initialize(sourcesMap) + if err != nil { + return nil, fmt.Errorf("unable to initialize tool %q: %w", name, err) + } + return t, nil + }() if err != nil { - return nil, fmt.Errorf("unable to initialize tool %q: %w", name, err) + return nil, err } toolsMap[name] = t } @@ -127,9 +177,21 @@ func NewServer(ctx context.Context, cfg ServerConfig, l log.Logger) (*Server, er // initialize and validate the toolsets from configs toolsetsMap := make(map[string]tools.Toolset) for name, tc := range cfg.ToolsetConfigs { - t, err := tc.Initialize(cfg.Version, toolsMap) + t, err := func() (tools.Toolset, error) { + _, span := instrumentation.Tracer.Start( + parentCtx, + "toolbox/server/toolset/init", + trace.WithAttributes(attribute.String("toolset_name", name)), + ) + defer span.End() + t, err := tc.Initialize(cfg.Version, toolsMap) + if err != nil { + return tools.Toolset{}, fmt.Errorf("unable to initialize toolset %q: %w", name, err) + } + return t, err + }() if err != nil { - return nil, fmt.Errorf("unable to initialize toolset %q: %w", name, err) + return nil, err } toolsetsMap[name] = t } @@ -139,10 +201,12 @@ func NewServer(ctx context.Context, cfg ServerConfig, l log.Logger) (*Server, er srv := &http.Server{Addr: addr, Handler: r} s := &Server{ - version: cfg.Version, - srv: srv, - root: r, - logger: l, + version: cfg.Version, + srv: srv, + root: r, + logger: l, + instrumentation: instrumentation, + sources: sourcesMap, authSources: authSourcesMap, tools: toolsMap, diff --git a/internal/server/server_test.go b/internal/server/server_test.go index 712b8925a..ca998173e 100644 --- a/internal/server/server_test.go +++ b/internal/server/server_test.go @@ -25,6 +25,7 @@ import ( "github.com/googleapis/genai-toolbox/internal/log" "github.com/googleapis/genai-toolbox/internal/server" + "github.com/googleapis/genai-toolbox/internal/telemetry" ) func TestServe(t *testing.T) { @@ -38,6 +39,17 @@ func TestServe(t *testing.T) { Port: port, } + otelShutdown, err := telemetry.SetupOTel(ctx, "0.0.0", "", false, "toolbox") + if err != nil { + t.Fatalf("unexpected error: %s", err) + } + defer func() { + err := otelShutdown(ctx) + if err != nil { + t.Fatalf("unexpected error: %s", err) + } + }() + testLogger, err := log.NewStdLogger(os.Stdout, os.Stderr, "info") if err != nil { t.Fatalf("unexpected error: %s", err) diff --git a/internal/sources/alloydbpg/alloydb_pg.go b/internal/sources/alloydbpg/alloydb_pg.go index f5b939630..7e761e0c9 100644 --- a/internal/sources/alloydbpg/alloydb_pg.go +++ b/internal/sources/alloydbpg/alloydb_pg.go @@ -23,6 +23,7 @@ import ( "cloud.google.com/go/alloydbconn" "github.com/googleapis/genai-toolbox/internal/sources" "github.com/jackc/pgx/v5/pgxpool" + "go.opentelemetry.io/otel/trace" ) const SourceKind string = "alloydb-postgres" @@ -47,8 +48,8 @@ func (r Config) SourceConfigKind() string { return SourceKind } -func (r Config) Initialize() (sources.Source, error) { - pool, err := initAlloyDBPgConnectionPool(r.Project, r.Region, r.Cluster, r.Instance, r.IPType.String(), r.User, r.Password, r.Database) +func (r Config) Initialize(ctx context.Context, tracer trace.Tracer) (sources.Source, error) { + pool, err := initAlloyDBPgConnectionPool(ctx, tracer, r.Name, r.Project, r.Region, r.Cluster, r.Instance, r.IPType.String(), r.User, r.Password, r.Database) if err != nil { return nil, fmt.Errorf("unable to create pool: %w", err) } @@ -93,7 +94,11 @@ func getDialOpts(ip_type string) ([]alloydbconn.DialOption, error) { } } -func initAlloyDBPgConnectionPool(project, region, cluster, instance, ip_type, user, pass, dbname string) (*pgxpool.Pool, error) { +func initAlloyDBPgConnectionPool(ctx context.Context, tracer trace.Tracer, name, project, region, cluster, instance, ip_type, user, pass, dbname string) (*pgxpool.Pool, error) { + //nolint:all // Reassigned ctx + ctx, span := sources.InitConnectionSpan(ctx, tracer, SourceKind, name) + defer span.End() + // Configure the driver to connect to the database dsn := fmt.Sprintf("user=%s password=%s dbname=%s sslmode=disable", user, pass, dbname) config, err := pgxpool.ParseConfig(dsn) diff --git a/internal/sources/cloudsqlpg/cloud_sql_pg.go b/internal/sources/cloudsqlpg/cloud_sql_pg.go index 8e887565b..45f7dc9c9 100644 --- a/internal/sources/cloudsqlpg/cloud_sql_pg.go +++ b/internal/sources/cloudsqlpg/cloud_sql_pg.go @@ -23,6 +23,7 @@ import ( "cloud.google.com/go/cloudsqlconn" "github.com/googleapis/genai-toolbox/internal/sources" "github.com/jackc/pgx/v5/pgxpool" + "go.opentelemetry.io/otel/trace" ) const SourceKind string = "cloud-sql-postgres" @@ -46,8 +47,8 @@ func (r Config) SourceConfigKind() string { return SourceKind } -func (r Config) Initialize() (sources.Source, error) { - pool, err := initCloudSQLPgConnectionPool(r.Project, r.Region, r.Instance, r.IPType.String(), r.User, r.Password, r.Database) +func (r Config) Initialize(ctx context.Context, tracer trace.Tracer) (sources.Source, error) { + pool, err := initCloudSQLPgConnectionPool(ctx, tracer, r.Name, r.Project, r.Region, r.Instance, r.IPType.String(), r.User, r.Password, r.Database) if err != nil { return nil, fmt.Errorf("unable to create pool: %w", err) } @@ -92,7 +93,11 @@ func getDialOpts(ip_type string) ([]cloudsqlconn.DialOption, error) { } } -func initCloudSQLPgConnectionPool(project, region, instance, ip_type, user, pass, dbname string) (*pgxpool.Pool, error) { +func initCloudSQLPgConnectionPool(ctx context.Context, tracer trace.Tracer, name, project, region, instance, ip_type, user, pass, dbname string) (*pgxpool.Pool, error) { + //nolint:all // Reassigned ctx + ctx, span := sources.InitConnectionSpan(ctx, tracer, SourceKind, name) + defer span.End() + // Configure the driver to connect to the database dsn := fmt.Sprintf("user=%s password=%s dbname=%s sslmode=disable", user, pass, dbname) config, err := pgxpool.ParseConfig(dsn) diff --git a/internal/sources/postgres/postgres.go b/internal/sources/postgres/postgres.go index 217336fb3..9694ca67b 100644 --- a/internal/sources/postgres/postgres.go +++ b/internal/sources/postgres/postgres.go @@ -20,6 +20,7 @@ import ( "github.com/googleapis/genai-toolbox/internal/sources" "github.com/jackc/pgx/v5/pgxpool" + "go.opentelemetry.io/otel/trace" ) const SourceKind string = "postgres" @@ -41,8 +42,8 @@ func (r Config) SourceConfigKind() string { return SourceKind } -func (r Config) Initialize() (sources.Source, error) { - pool, err := initPostgresConnectionPool(r.Host, r.Port, r.User, r.Password, r.Database) +func (r Config) Initialize(ctx context.Context, tracer trace.Tracer) (sources.Source, error) { + pool, err := initPostgresConnectionPool(ctx, tracer, r.Name, r.Host, r.Port, r.User, r.Password, r.Database) if err != nil { return nil, fmt.Errorf("Unable to create pool: %w", err) } @@ -76,7 +77,10 @@ func (s *Source) PostgresPool() *pgxpool.Pool { return s.Pool } -func initPostgresConnectionPool(host, port, user, pass, dbname string) (*pgxpool.Pool, error) { +func initPostgresConnectionPool(ctx context.Context, tracer trace.Tracer, name, host, port, user, pass, dbname string) (*pgxpool.Pool, error) { + //nolint:all // Reassigned ctx + ctx, span := sources.InitConnectionSpan(ctx, tracer, SourceKind, name) + defer span.End() // urlExample := "postgres:dd//username:password@localhost:5432/database_name" i := fmt.Sprintf("postgres://%s:%s@%s:%s/%s", user, pass, host, port, dbname) pool, err := pgxpool.New(context.Background(), i) diff --git a/internal/sources/sources.go b/internal/sources/sources.go index 1c9fa9352..78c3363f5 100644 --- a/internal/sources/sources.go +++ b/internal/sources/sources.go @@ -14,13 +14,31 @@ package sources +import ( + "context" + + "go.opentelemetry.io/otel/attribute" + "go.opentelemetry.io/otel/trace" +) + // SourceConfig is the interface for configuring a source. type SourceConfig interface { SourceConfigKind() string - Initialize() (Source, error) + Initialize(ctx context.Context, tracer trace.Tracer) (Source, error) } // Source is the interface for the source itself. type Source interface { SourceKind() string } + +// InitConnectionSpan adds a span for database pool connection initialization +func InitConnectionSpan(ctx context.Context, tracer trace.Tracer, sourceKind, sourceName string) (context.Context, trace.Span) { + ctx, span := tracer.Start( + ctx, + "toolbox/server/source/connect", + trace.WithAttributes(attribute.String("source_kind", sourceKind)), + trace.WithAttributes(attribute.String("source_name", sourceName)), + ) + return ctx, span +} diff --git a/internal/sources/spanner/spanner.go b/internal/sources/spanner/spanner.go index b43dc5103..de8cfa166 100644 --- a/internal/sources/spanner/spanner.go +++ b/internal/sources/spanner/spanner.go @@ -20,6 +20,7 @@ import ( "cloud.google.com/go/spanner" "github.com/googleapis/genai-toolbox/internal/sources" + "go.opentelemetry.io/otel/trace" ) const SourceKind string = "spanner" @@ -40,8 +41,8 @@ func (r Config) SourceConfigKind() string { return SourceKind } -func (r Config) Initialize() (sources.Source, error) { - client, err := initSpannerClient(r.Project, r.Instance, r.Database) +func (r Config) Initialize(ctx context.Context, tracer trace.Tracer) (sources.Source, error) { + client, err := initSpannerClient(ctx, tracer, r.Name, r.Project, r.Instance, r.Database) if err != nil { return nil, fmt.Errorf("unable to create client: %w", err) } @@ -76,7 +77,11 @@ func (s *Source) DatabaseDialect() string { return s.Dialect } -func initSpannerClient(project, instance, dbname string) (*spanner.Client, error) { +func initSpannerClient(ctx context.Context, tracer trace.Tracer, name, project, instance, dbname string) (*spanner.Client, error) { + //nolint:all // Reassigned ctx + ctx, span := sources.InitConnectionSpan(ctx, tracer, SourceKind, name) + defer span.End() + // Configure the connection to the database db := fmt.Sprintf("projects/%s/instances/%s/databases/%s", project, instance, dbname) @@ -89,8 +94,7 @@ func initSpannerClient(project, instance, dbname string) (*spanner.Client, error } // Create spanner client - ctx := context.Background() - client, err := spanner.NewClientWithConfig(ctx, db, spanner.ClientConfig{SessionPoolConfig: sessionPoolConfig}) + client, err := spanner.NewClientWithConfig(context.Background(), db, spanner.ClientConfig{SessionPoolConfig: sessionPoolConfig}) if err != nil { return nil, fmt.Errorf("unable to create new client: %w", err) } diff --git a/internal/telemetry/telemetry.go b/internal/telemetry/telemetry.go new file mode 100644 index 000000000..81b0d7ca8 --- /dev/null +++ b/internal/telemetry/telemetry.go @@ -0,0 +1,159 @@ +// Copyright 2024 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package telemetry + +import ( + "context" + "errors" + "fmt" + + mexporter "github.com/GoogleCloudPlatform/opentelemetry-operations-go/exporter/metric" + texporter "github.com/GoogleCloudPlatform/opentelemetry-operations-go/exporter/trace" + "go.opentelemetry.io/contrib/propagators/autoprop" + "go.opentelemetry.io/otel" + "go.opentelemetry.io/otel/exporters/otlp/otlpmetric/otlpmetrichttp" + "go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracehttp" + "go.opentelemetry.io/otel/sdk/metric" + "go.opentelemetry.io/otel/sdk/resource" + tracesdk "go.opentelemetry.io/otel/sdk/trace" + semconv "go.opentelemetry.io/otel/semconv/v1.26.0" +) + +// setupOTelSDK bootstraps the OpenTelemetry pipeline. +// If it does not return an error, make sure to call shutdown for proper cleanup. +func SetupOTel(ctx context.Context, versionString, telemetryOTLP string, telemetryGCP bool, telemetryServiceName string) (shutdown func(context.Context) error, err error) { + var shutdownFuncs []func(context.Context) error + + // shutdown calls cleanup functions registered via shutdownFuncs. + // The errors from the calls are joined. + // Each registered cleanup will be invoked once. + shutdown = func(ctx context.Context) error { + var err error + for _, fn := range shutdownFuncs { + err = errors.Join(err, fn(ctx)) + } + shutdownFuncs = nil + return err + } + + // handleErr calls shutdown for cleanup and makes sure that all errors are returned. + handleErr := func(inErr error) { + err = errors.Join(inErr, shutdown(ctx)) + } + + // Configure Context Propagation to use the default W3C traceparent format. + otel.SetTextMapPropagator(autoprop.NewTextMapPropagator()) + + res, err := newResource(ctx, versionString, telemetryServiceName) + if err != nil { + errMsg := fmt.Errorf("unable to set up resource: %w", err) + handleErr(errMsg) + return + } + + tracerProvider, err := newTracerProvider(ctx, res, telemetryOTLP, telemetryGCP) + if err != nil { + errMsg := fmt.Errorf("unable to set up trace provider: %w", err) + handleErr(errMsg) + return + } + shutdownFuncs = append(shutdownFuncs, tracerProvider.Shutdown) + otel.SetTracerProvider(tracerProvider) + + meterProvider, err := newMeterProvider(ctx, res, telemetryOTLP, telemetryGCP) + if err != nil { + errMsg := fmt.Errorf("unable to set up meter provider: %w", err) + handleErr(errMsg) + return + } + shutdownFuncs = append(shutdownFuncs, meterProvider.Shutdown) + otel.SetMeterProvider(meterProvider) + + return shutdown, nil +} + +// newResource create default resources for telemetry data. +// Resource represents the entity producing telemetry. +func newResource(ctx context.Context, versionString string, telemetryServiceName string) (*resource.Resource, error) { + // Ensure default SDK resources and the required service name are set. + r, err := resource.New( + ctx, + resource.WithFromEnv(), // Discover and provide attributes from OTEL_RESOURCE_ATTRIBUTES and OTEL_SERVICE_NAME environment variables. + resource.WithTelemetrySDK(), // Discover and provide information about the OTel SDK used. + resource.WithOS(), // Discover and provide OS information. + resource.WithContainer(), // Discover and provide container information. + resource.WithHost(), //Discover and provide host information. + resource.WithSchemaURL(semconv.SchemaURL), // Set the schema url. + resource.WithAttributes( // Add other custom resource attributes. + semconv.ServiceName(telemetryServiceName), + semconv.ServiceVersion(versionString), + ), + ) + if err != nil { + return nil, fmt.Errorf("trace provider fail to set up resource: %w", err) + } + return r, nil +} + +// newTracerProvider creates TracerProvider. +// TracerProvider is a factory for Tracers and is responsible for creating spans. +func newTracerProvider(ctx context.Context, r *resource.Resource, telemetryOTLP string, telemetryGCP bool) (*tracesdk.TracerProvider, error) { + traceOpts := []tracesdk.TracerProviderOption{} + if telemetryOTLP != "" { + // otlptracehttp provides an OTLP span exporter using HTTP with protobuf payloads. + // By default, the telemetry is sent to https://localhost:4318/v1/traces. + otlpExporter, err := otlptracehttp.New(ctx, otlptracehttp.WithEndpoint(telemetryOTLP)) + if err != nil { + return nil, err + } + traceOpts = append(traceOpts, tracesdk.WithBatcher(otlpExporter)) + } + if telemetryGCP { + gcpExporter, err := texporter.New() + if err != nil { + return nil, err + } + traceOpts = append(traceOpts, tracesdk.WithBatcher(gcpExporter)) + } + traceOpts = append(traceOpts, tracesdk.WithResource(r)) + + traceProvider := tracesdk.NewTracerProvider(traceOpts...) + return traceProvider, nil +} + +// newMeterProvider creates MeterProvider. +// MeterProvider is a factory for Meters, and is responsible for creating metrics. +func newMeterProvider(ctx context.Context, r *resource.Resource, telemetryOTLP string, telemetryGCP bool) (*metric.MeterProvider, error) { + metricOpts := []metric.Option{} + if telemetryOTLP != "" { + // otlpmetrichttp provides an OTLP metrics exporter using HTTP with protobuf payloads. + // By default, the telemetry is sent to https://localhost:4318/v1/metrics. + otlpExporter, err := otlpmetrichttp.New(ctx, otlpmetrichttp.WithEndpoint(telemetryOTLP)) + if err != nil { + return nil, err + } + metricOpts = append(metricOpts, metric.WithReader(metric.NewPeriodicReader(otlpExporter))) + } + if telemetryGCP { + gcpExporter, err := mexporter.New() + if err != nil { + return nil, err + } + metricOpts = append(metricOpts, metric.WithReader(metric.NewPeriodicReader(gcpExporter))) + } + + meterProvider := metric.NewMeterProvider(metricOpts...) + return meterProvider, nil +} diff --git a/internal/tools/spanner/spanner.go b/internal/tools/spanner/spanner.go index bbb77655e..55e1204ed 100644 --- a/internal/tools/spanner/spanner.go +++ b/internal/tools/spanner/spanner.go @@ -130,8 +130,7 @@ func (t Tool) Invoke(params tools.ParamValues) (string, error) { fmt.Printf("Invoked tool %s\n", t.Name) var out strings.Builder - ctx := context.Background() - _, err = t.Client.ReadWriteTransaction(ctx, func(ctx context.Context, txn *spanner.ReadWriteTransaction) error { + _, err = t.Client.ReadWriteTransaction(context.Background(), func(ctx context.Context, txn *spanner.ReadWriteTransaction) error { stmt := spanner.Statement{ SQL: t.Statement, Params: mapParams,