From a8ee83e2c5097cb253db02fb91dee410018d0ebc Mon Sep 17 00:00:00 2001 From: Andrea Gottardo Date: Fri, 14 Jun 2024 11:53:56 -0700 Subject: [PATCH] health: begin work to use structured health warnings instead of strings, pipe changes into ipn.Notify (#12406) Updates tailscale/tailscale#4136 This PR is the first round of work to move from encoding health warnings as strings and use structured data instead. The current health package revolves around the idea of Subsystems. Each subsystem can have (or not have) a Go error associated with it. The overall health of the backend is given by the concatenation of all these errors. This PR polishes the concept of Warnable introduced by @bradfitz a few weeks ago. Each Warnable is a component of the backend (for instance, things like 'dns' or 'magicsock' are Warnables). Each Warnable has a unique identifying code. A Warnable is an entity we can warn the user about, by setting (or unsetting) a WarningState for it. Warnables have: - an identifying Code, so that the GUI can track them as their WarningStates come and go - a Title, which the GUIs can use to tell the user what component of the backend is broken - a Text, which is a function that is called with a set of Args to generate a more detailed error message to explain the unhappy state Additionally, this PR also begins to send Warnables and their WarningStates through LocalAPI to the clients, using ipn.Notify messages. An ipn.Notify is only issued when a warning is added or removed from the Tracker. In a next PR, we'll get rid of subsystems entirely, and we'll start using structured warnings for all errors affecting the backend functionality. Signed-off-by: Andrea Gottardo --- control/controlclient/direct.go | 13 +- health/args.go | 30 ++ health/health.go | 558 ++++++++++++++++++++-------- health/health_test.go | 133 ++++++- health/state.go | 79 ++++ health/warnings.go | 206 ++++++++++ ipn/backend.go | 11 + ipn/ipnlocal/local.go | 64 +++- net/dns/direct_linux.go | 12 +- wgengine/router/ifconfig_windows.go | 29 +- wgengine/router/router_linux.go | 17 +- 11 files changed, 939 insertions(+), 213 deletions(-) create mode 100644 health/args.go create mode 100644 health/state.go create mode 100644 health/warnings.go diff --git a/control/controlclient/direct.go b/control/controlclient/direct.go index 49879710e12cc..b305a302e5a03 100644 --- a/control/controlclient/direct.go +++ b/control/controlclient/direct.go @@ -1581,9 +1581,9 @@ func postPingResult(start time.Time, logf logger.Logf, c *http.Client, pr *tailc } // ReportHealthChange reports to the control plane a change to this node's -// health. -func (c *Direct) ReportHealthChange(sys health.Subsystem, sysErr error) { - if sys == health.SysOverall { +// health. w must be non-nil. us can be nil to indicate a healthy state for w. +func (c *Direct) ReportHealthChange(w *health.Warnable, us *health.UnhealthyState) { + if w == health.NetworkStatusWarnable || w == health.IPNStateWarnable || w == health.LoginStateWarnable { // We don't report these. These include things like the network is down // (in which case we can't report anyway) or the user wanted things // stopped, as opposed to the more unexpected failure types in the other @@ -1602,12 +1602,13 @@ func (c *Direct) ReportHealthChange(sys health.Subsystem, sysErr error) { if c.panicOnUse { panic("tainted client") } + // TODO(angott): at some point, update `Subsys` in the request to be `Warnable` req := &tailcfg.HealthChangeRequest{ - Subsys: string(sys), + Subsys: string(w.Code), NodeKey: nodeKey, } - if sysErr != nil { - req.Error = sysErr.Error() + if us != nil { + req.Error = us.Text } // Best effort, no logging: diff --git a/health/args.go b/health/args.go new file mode 100644 index 0000000000000..18fe4365a7986 --- /dev/null +++ b/health/args.go @@ -0,0 +1,30 @@ +// Copyright (c) Tailscale Inc & AUTHORS +// SPDX-License-Identifier: BSD-3-Clause + +package health + +// Arg is a type for the key to be used in the Args of a Warnable. +type Arg string + +const ( + // ArgAvailableVersion provides an update notification Warnable with the available version of the Tailscale client. + ArgAvailableVersion Arg = "available-version" + + // ArgCurrentVersion provides an update notification Warnable with the current version of the Tailscale client. + ArgCurrentVersion Arg = "current-version" + + // ArgDuration provides a Warnable with how long the Warnable has been in an unhealthy state. + ArgDuration Arg = "duration" + + // ArgError provides a Warnable with the underlying error behind an unhealthy state. + ArgError Arg = "error" + + // ArgMagicsockFunctionName provides a Warnable with the name of the Magicsock function that caused the unhealthy state. + ArgMagicsockFunctionName Arg = "magicsock-function-name" + + // ArgRegionID provides a Warnable with the ID of a DERP server involved in the unhealthy state. + ArgRegionID Arg = "region-id" + + // ArgServerName provides a Warnable with the hostname of a server involved in the unhealthy state. + ArgServerName Arg = "server-name" +) diff --git a/health/health.go b/health/health.go index 08badf04ec03d..db3bc8e901cfc 100644 --- a/health/health.go +++ b/health/health.go @@ -8,6 +8,7 @@ package health import ( "errors" "fmt" + "maps" "net/http" "os" "runtime" @@ -67,10 +68,12 @@ type Tracker struct { mu sync.Mutex warnables []*Warnable // keys ever set - warnableVal map[*Warnable]error + warnableVal map[*Warnable]*warningState - sysErr map[Subsystem]error // subsystem => err (or nil for no error) - watchers set.HandleSet[func(Subsystem, error)] // opt func to run if error state changes + // sysErr maps subsystems to their current error (or nil if the subsystem is healthy) + // Deprecated: using Warnables should be preferred + sysErr map[Subsystem]error + watchers set.HandleSet[func(*Warnable, *UnhealthyState)] // opt func to run if error state changes timer *time.Timer latestVersion *tailcfg.ClientVersion // or nil @@ -97,13 +100,12 @@ type Tracker struct { } // Subsystem is the name of a subsystem whose health can be monitored. +// +// Deprecated: Registering a Warnable using Register() and updating its health state +// with SetUnhealthy() and SetHealthy() should be preferred. type Subsystem string const ( - // SysOverall is the name representing the overall health of - // the system, rather than one particular subsystem. - SysOverall = Subsystem("overall") - // SysRouter is the name of the wgengine/router subsystem. SysRouter = Subsystem("router") @@ -120,55 +122,98 @@ const ( SysTKA = Subsystem("tailnet-lock") ) -// NewWarnable returns a new warnable item that the caller can mark as health or -// in warning state via Tracker.SetWarnable. -// -// NewWarnable is generally called in init and stored in a package global. It -// can be used by multiple Trackers. -func NewWarnable(opts ...WarnableOpt) *Warnable { - w := new(Warnable) - for _, o := range opts { - o.mod(w) +var subsystemsWarnables = map[Subsystem]*Warnable{} + +const legacyErrorArgKey = "LegacyError" + +// Warnable() returns a Warnable representing a legacy Subsystem. This is used +// *temporarily* while we migrate the old health infrastructure based on +// Subsystems to the new Warnables architecture. +func (s Subsystem) Warnable() *Warnable { + if w, ok := subsystemsWarnables[s]; ok { + return w + } else { + w := Register(&Warnable{ + Code: WarnableCode(s), + Severity: SeverityMedium, + Text: func(args Args) string { + return args[legacyErrorArgKey] + }, + }) + subsystemsWarnables[s] = w + return w } - return w } -// WarnableOpt is an option passed to NewWarnable. -type WarnableOpt interface { - mod(*Warnable) -} +var registeredWarnables = map[WarnableCode]*Warnable{} -// WithMapDebugFlag returns a WarnableOpt for NewWarnable that makes the returned -// Warnable report itself to the coordination server as broken with this -// string in MapRequest.DebugFlag when Set to a non-nil value. -func WithMapDebugFlag(name string) WarnableOpt { - return warnOptFunc(func(w *Warnable) { - w.debugFlag = name - }) +// Register registers a new Warnable with the health package and returns it. +// Register panics if the Warnable was already registered, because Warnables +// should be unique across the program. +func Register(w *Warnable) *Warnable { + if registeredWarnables[w.Code] != nil { + panic(fmt.Sprintf("health: a Warnable with code %q was already registered", w.Code)) + } + mak.Set(®isteredWarnables, w.Code, w) + return w } -// WithConnectivityImpact returns an option which makes a Warnable annotated as -// something that could be breaking external network connectivity on the -// machine. This will make the warnable returned by OverallError alongside -// network connectivity errors. -func WithConnectivityImpact() WarnableOpt { - return warnOptFunc(func(w *Warnable) { - w.hasConnectivityImpact = true - }) +// unregister removes a Warnable from the health package. It should only be used +// for testing purposes. +func unregister(w *Warnable) { + if registeredWarnables[w.Code] == nil { + panic(fmt.Sprintf("health: attempting to unregister Warnable %q that was not registered", w.Code)) + } + delete(registeredWarnables, w.Code) } -type warnOptFunc func(*Warnable) - -func (f warnOptFunc) mod(w *Warnable) { f(w) } - -// Warnable is a health check item that may or may not be in a bad warning state. -// The caller of NewWarnable is responsible for calling Tracker.SetWarnable to update the state. +// WarnableCode is a string that distinguishes each Warnable from others. It is globally unique within +// the program. +type WarnableCode string + +// A Warnable is something that we might want to warn the user about, or not. A Warnable is either +// in an healthy or unhealth state. A Warnable is unhealthy if the Tracker knows about a WarningState +// affecting the Warnable. +// In most cases, Warnables are components of the backend (for instance, "DNS" or "Magicsock"). +// Warnables are similar to the Subsystem type previously used in this package, but they provide +// a unique identifying code for each Warnable, along with more metadata that makes it easier for +// a GUI to display the Warnable in a user-friendly way. type Warnable struct { - debugFlag string // optional MapRequest.DebugFlag to send when unhealthy - - // If true, this warning is related to configuration of networking stack + // Code is a string that uniquely identifies this Warnable across the entire Tailscale backend, + // and can be mapped to a user-displayable localized string. + Code WarnableCode + // Title is a string that the GUI uses as title for any message involving this Warnable. The title + // should be short and fit in a single line. + Title string + // Text is a function that generates an extended string that the GUI will display to the user when + // this Warnable is in an unhealthy state. The function can use the Args map to provide dynamic + // information to the user. + Text func(args Args) string + // Severity is the severity of the Warnable, which the GUI can use to determine how to display it. + // For instance, a Warnable with SeverityHigh could trigger a modal view, while a Warnable with + // SeverityLow could be displayed in a less intrusive way. + // TODO(angott): turn this into a SeverityFunc, which allows the Warnable to change its severity based on + // the Args of the unhappy state, just like we do in the Text function. + Severity Severity + // DependsOn is a set of Warnables that this Warnable depends, on and need to be healthy + // before this Warnable can also be healthy again. The GUI can use this information to ignore + // this Warnable if one of its dependencies is unhealthy. + DependsOn []*Warnable + + // MapDebugFlag is a MapRequest.DebugFlag that is sent to control when this Warnable is unhealthy + // + // Deprecated: this is only used in one case, and will be removed in a future PR + MapDebugFlag string + + // If true, this warnable is related to configuration of networking stack // on the machine that impacts connectivity. - hasConnectivityImpact bool + ImpactsConnectivity bool +} + +// StaticMessage returns a function that always returns the input string, to be used in +// simple Warnables that do not use the Args map to generate their Text. +func StaticMessage(s string) func(Args) string { + return func(Args) string { return s } } // nil reports whether t is nil. @@ -180,6 +225,7 @@ func (t *Tracker) nil() bool { if t != nil { return false } + if cibuild.On() { stack := make([]byte, 1<<10) stack = stack[:runtime.Stack(stack, false)] @@ -191,19 +237,104 @@ func (t *Tracker) nil() bool { return true } -// Set updates the Warnable's state. -// If non-nil, it's considered unhealthy. -func (t *Tracker) SetWarnable(w *Warnable, err error) { +// Severity represents how serious an error is. Each GUI interprets this severity value in different ways, +// to surface the error in a more or less visible way. For instance, the macOS GUI could change its menubar +// icon to display an exclamation mark and present a modal notification for SeverityHigh warnings, but not +// for SeverityLow messages, which would only appear in the Settings window. +type Severity string + +const ( + SeverityHigh Severity = "high" + SeverityMedium Severity = "medium" + SeverityLow Severity = "low" +) + +// Args is a map of Args to string values that can be used to provide parameters regarding +// the unhealthy state of a Warnable. +// For instance, if you have a Warnable to track the health of DNS lookups, here you can include +// the hostname that failed to resolve, or the IP address of the DNS server that has been failing +// to respond. You can then use these parameters in the Text function of the Warnable to provide a detailed +// error message to the user. +type Args map[Arg]string + +// A warningState is a condition affecting a Warnable. For each Warnable known to the Tracker, a Warnable +// is in an unhappy state if there is a warningState associated with the Warnable. +type warningState struct { + BrokenSince time.Time // when the Warnable became unhealthy + Args Args // args can be used to provide parameters to the function that generates the Text in the Warnable +} + +func (ws *warningState) Equal(other *warningState) bool { + if ws == nil && other == nil { + return true + } + if ws == nil || other == nil { + return false + } + return ws.BrokenSince.Equal(other.BrokenSince) && maps.Equal(ws.Args, other.Args) +} + +// SetUnhealthy sets a warningState for the given Warnable with the provided Args, and should be +// called when a Warnable becomes unhealthy, or its unhealthy status needs to be updated. +// SetUnhealthy takes ownership of args. The args can be nil if no additional information is +// needed for the unhealthy state. +func (t *Tracker) SetUnhealthy(w *Warnable, args Args) { if t.nil() { return } t.mu.Lock() defer t.mu.Unlock() - l0 := len(t.warnableVal) - mak.Set(&t.warnableVal, w, err) - if len(t.warnableVal) != l0 { + t.setUnhealthyLocked(w, args) +} + +func (t *Tracker) setUnhealthyLocked(w *Warnable, args Args) { + if w == nil { + return + } + + // If we already have a warningState for this Warnable with an earlier BrokenSince time, keep that + // BrokenSince time. + brokenSince := time.Now() + if existingWS := t.warnableVal[w]; existingWS != nil { + brokenSince = existingWS.BrokenSince + } + + if t.warnableVal[w] == nil { t.warnables = append(t.warnables, w) } + ws := &warningState{ + BrokenSince: brokenSince, + Args: args, + } + prevWs := t.warnableVal[w] + mak.Set(&t.warnableVal, w, ws) + if !ws.Equal(prevWs) { + for _, cb := range t.watchers { + go cb(w, w.unhealthyState(ws)) + } + } +} + +// SetHealthy removes any warningState for the given Warnable. +func (t *Tracker) SetHealthy(w *Warnable) { + if t.nil() { + return + } + t.mu.Lock() + defer t.mu.Unlock() + t.setHealthyLocked(w) +} + +func (t *Tracker) setHealthyLocked(w *Warnable) { + if t.warnableVal[w] == nil { + // Nothing to remove + return + } + + delete(t.warnableVal, w) + for _, cb := range t.watchers { + go cb(w, nil) + } } // AppendWarnableDebugFlags appends to base any health items that are currently in failed @@ -218,29 +349,31 @@ func (t *Tracker) AppendWarnableDebugFlags(base []string) []string { t.mu.Lock() defer t.mu.Unlock() for w, err := range t.warnableVal { - if w.debugFlag == "" { + if w.MapDebugFlag == "" { continue } if err != nil { - ret = append(ret, w.debugFlag) + ret = append(ret, w.MapDebugFlag) } } sort.Strings(ret[len(base):]) // sort the new ones return ret } -// RegisterWatcher adds a function that will be called if an -// error changes state either to unhealthy or from unhealthy. It is -// not called on transition from unknown to healthy. It must be non-nil -// and is run in its own goroutine. The returned func unregisters it. -func (t *Tracker) RegisterWatcher(cb func(key Subsystem, err error)) (unregister func()) { +// RegisterWatcher adds a function that will be called whenever the health state of any Warnable changes. +// If a Warnable becomes unhealthy or its unhealthy state is updated, the callback will be called with its +// current Representation. +// If a Warnable becomes healthy, the callback will be called with ws set to nil. +// The provided callback function will be executed in its own goroutine. The returned function can be used +// to unregister the callback. +func (t *Tracker) RegisterWatcher(cb func(w *Warnable, r *UnhealthyState)) (unregister func()) { if t.nil() { return func() {} } t.mu.Lock() defer t.mu.Unlock() if t.watchers == nil { - t.watchers = set.HandleSet[func(Subsystem, error)]{} + t.watchers = set.HandleSet[func(*Warnable, *UnhealthyState)]{} } handle := t.watchers.Add(cb) if t.timer == nil { @@ -258,31 +391,49 @@ func (t *Tracker) RegisterWatcher(cb func(key Subsystem, err error)) (unregister } // SetRouterHealth sets the state of the wgengine/router.Router. +// +// Deprecated: Warnables should be preferred over Subsystem errors. func (t *Tracker) SetRouterHealth(err error) { t.setErr(SysRouter, err) } // RouterHealth returns the wgengine/router.Router error state. +// +// Deprecated: Warnables should be preferred over Subsystem errors. func (t *Tracker) RouterHealth() error { return t.get(SysRouter) } // SetDNSHealth sets the state of the net/dns.Manager +// +// Deprecated: Warnables should be preferred over Subsystem errors. func (t *Tracker) SetDNSHealth(err error) { t.setErr(SysDNS, err) } // DNSHealth returns the net/dns.Manager error state. +// +// Deprecated: Warnables should be preferred over Subsystem errors. func (t *Tracker) DNSHealth() error { return t.get(SysDNS) } // SetDNSOSHealth sets the state of the net/dns.OSConfigurator +// +// Deprecated: Warnables should be preferred over Subsystem errors. func (t *Tracker) SetDNSOSHealth(err error) { t.setErr(SysDNSOS, err) } // SetDNSManagerHealth sets the state of the Linux net/dns manager's // discovery of the /etc/resolv.conf situation. +// +// Deprecated: Warnables should be preferred over Subsystem errors. func (t *Tracker) SetDNSManagerHealth(err error) { t.setErr(SysDNSManager, err) } // DNSOSHealth returns the net/dns.OSConfigurator error state. +// +// Deprecated: Warnables should be preferred over Subsystem errors. func (t *Tracker) DNSOSHealth() error { return t.get(SysDNSOS) } // SetTKAHealth sets the health of the tailnet key authority. +// +// Deprecated: Warnables should be preferred over Subsystem errors. func (t *Tracker) SetTKAHealth(err error) { t.setErr(SysTKA, err) } // TKAHealth returns the tailnet key authority error state. +// +// Deprecated: Warnables should be preferred over Subsystem errors. func (t *Tracker) TKAHealth() error { return t.get(SysTKA) } // SetLocalLogConfigHealth sets the error state of this client's local log configuration. @@ -362,8 +513,20 @@ func (t *Tracker) setLocked(key Subsystem, err error) { } t.sysErr[key] = err t.selfCheckLocked() - for _, cb := range t.watchers { - go cb(key, err) +} + +// updateLegacyErrorWarnableLocked takes a legacy Subsystem and an optional error, and +// updates the WarningState for that legacy Subsystem, setting it to healthy or unhealthy. +// It is used temporarily while we migrate from Subsystems to Warnables. +// +// Deprecated: this function will be removed after migrating all subsystem errors to use +// Warnables instead. +func (t *Tracker) updateLegacyErrorWarnableLocked(key Subsystem, err error) { + w := key.Warnable() + if err != nil { + t.setUnhealthyLocked(key.Warnable(), Args{legacyErrorArgKey: err.Error()}) + } else { + t.setHealthyLocked(w) } } @@ -598,24 +761,7 @@ func (t *Tracker) selfCheckLocked() { // Don't check yet. return } - t.setLocked(SysOverall, t.overallErrorLocked()) -} - -// AppendWarnings appends all current health warnings to dst and returns the -// result. -func (t *Tracker) AppendWarnings(dst []string) []string { - err := t.OverallError() - if err == nil { - return dst - } - if me, ok := err.(multierr.Error); ok { - for _, err := range me.Errors() { - dst = append(dst, err.Error()) - } - } else { - dst = append(dst, err.Error()) - } - return dst + t.updateBuiltinWarnablesLocked() } // OverallError returns a summary of the health state. @@ -628,108 +774,161 @@ func (t *Tracker) OverallError() error { } t.mu.Lock() defer t.mu.Unlock() - return t.overallErrorLocked() + t.updateBuiltinWarnablesLocked() + return t.multiErrLocked() } -var fakeErrForTesting = envknob.RegisterString("TS_DEBUG_FAKE_HEALTH_ERROR") - -// networkErrorfLocked creates an error that indicates issues with outgoing network -// connectivity. Any active warnings related to network connectivity will -// automatically be appended to it. -// -// t.mu must be held. -func (t *Tracker) networkErrorfLocked(format string, a ...any) error { - errs := []error{ - fmt.Errorf(format, a...), +// Strings() returns a string array containing the Text of all Warnings +// currently known to the Tracker. These strings can be presented to the +// user, although ideally you would use the Code property on each Warning +// to show a localized version of them instead. +// This function is here for legacy compatibility purposes and is deprecated. +func (t *Tracker) Strings() []string { + if t.nil() { + return nil } - for _, w := range t.warnables { - if !w.hasConnectivityImpact { - continue - } - if err := t.warnableVal[w]; err != nil { - errs = append(errs, err) + t.mu.Lock() + defer t.mu.Unlock() + return t.stringsLocked() +} + +func (t *Tracker) stringsLocked() []string { + result := []string{} + for w, ws := range t.warnableVal { + if ws.Args == nil { + result = append(result, w.Text(Args{})) + } else { + result = append(result, w.Text(ws.Args)) } } - if len(errs) == 1 { - return errs[0] + return result +} + +// errorsLocked returns an array of errors where each error is the Text +// of a Warning known to the Tracker. +// This function is here for legacy compatibility purposes and is deprecated. +func (t *Tracker) errorsLocked() []error { + strs := t.stringsLocked() + errs := []error{} + for _, str := range strs { + errs = append(errs, errors.New(str)) } - return multierr.New(errs...) + return errs } -var errNetworkDown = errors.New("network down") -var errNotInMapPoll = errors.New("not in map poll") -var errNoDERPHome = errors.New("no DERP home") -var errNoUDP4Bind = errors.New("no udp4 bind") -var errUnstable = errors.New("This is an unstable (development) version of Tailscale; frequent updates and bugs are likely") +// multiErrLocked returns an error listing all errors known to the Tracker. +// This function is here for legacy compatibility purposes and is deprecated. +func (t *Tracker) multiErrLocked() error { + errs := t.errorsLocked() + return multierr.New(errs...) +} -func (t *Tracker) overallErrorLocked() error { - var errs []error - add := func(err error) { - if err != nil { - errs = append(errs, err) - } - } - merged := func() error { - return multierr.New(errs...) - } +var fakeErrForTesting = envknob.RegisterString("TS_DEBUG_FAKE_HEALTH_ERROR") +// updateBuiltinWarnablesLocked performs a number of checks on the state of the backend, +// and adds/removes Warnings from the Tracker as needed. +func (t *Tracker) updateBuiltinWarnablesLocked() { if t.checkForUpdates { if cv := t.latestVersion; cv != nil && !cv.RunningLatest && cv.LatestVersion != "" { if cv.UrgentSecurityUpdate { - add(fmt.Errorf("Security update available: %v -> %v, run `tailscale update` or `tailscale set --auto-update` to update", version.Short(), cv.LatestVersion)) + t.setUnhealthyLocked(securityUpdateAvailableWarnable, Args{ + ArgCurrentVersion: version.Short(), + ArgAvailableVersion: cv.LatestVersion, + }) } else { - add(fmt.Errorf("Update available: %v -> %v, run `tailscale update` or `tailscale set --auto-update` to update", version.Short(), cv.LatestVersion)) + t.setUnhealthyLocked(updateAvailableWarnable, Args{ + ArgCurrentVersion: version.Short(), + ArgAvailableVersion: cv.LatestVersion, + }) } } } + if version.IsUnstableBuild() { - add(errUnstable) + t.setUnhealthyLocked(unstableWarnable, Args{ + ArgCurrentVersion: version.Short(), + }) } if v, ok := t.anyInterfaceUp.Get(); ok && !v { - add(errNetworkDown) - return merged() + t.setUnhealthyLocked(NetworkStatusWarnable, nil) + return + } else { + t.setHealthyLocked(NetworkStatusWarnable) } + if t.localLogConfigErr != nil { - add(t.localLogConfigErr) - return merged() + t.setUnhealthyLocked(localLogWarnable, Args{ + ArgError: t.localLogConfigErr.Error(), + }) + return + } else { + t.setHealthyLocked(localLogWarnable) } + if !t.ipnWantRunning { - add(fmt.Errorf("state=%v, wantRunning=%v", t.ipnState, t.ipnWantRunning)) - return merged() + t.setUnhealthyLocked(IPNStateWarnable, Args{ + "State": t.ipnState, + }) + return + } else { + t.setHealthyLocked(IPNStateWarnable) } + if t.lastLoginErr != nil { - add(fmt.Errorf("not logged in, last login error=%v", t.lastLoginErr)) - return merged() + t.setUnhealthyLocked(LoginStateWarnable, Args{ + ArgError: t.lastLoginErr.Error(), + }) + return + } else { + t.setHealthyLocked(LoginStateWarnable) } + now := time.Now() if !t.inMapPoll && (t.lastMapPollEndedAt.IsZero() || now.Sub(t.lastMapPollEndedAt) > 10*time.Second) { - add(errNotInMapPoll) - return merged() + t.setUnhealthyLocked(notInMapPollWarnable, nil) + return + } else { + t.setHealthyLocked(notInMapPollWarnable) } + const tooIdle = 2*time.Minute + 5*time.Second if d := now.Sub(t.lastStreamedMapResponse).Round(time.Second); d > tooIdle { - add(t.networkErrorfLocked("no map response in %v", d)) - return merged() + t.setUnhealthyLocked(mapResponseTimeoutWarnable, Args{ + ArgDuration: d.String(), + }) + return + } else { + t.setHealthyLocked(mapResponseTimeoutWarnable) } + if !t.derpHomeless { rid := t.derpHomeRegion if rid == 0 { - add(errNoDERPHome) - return merged() - } - if !t.derpRegionConnected[rid] { - add(t.networkErrorfLocked("not connected to home DERP region %v", rid)) - return merged() - } - if d := now.Sub(t.derpRegionLastFrame[rid]).Round(time.Second); d > tooIdle { - add(t.networkErrorfLocked("haven't heard from home DERP region %v in %v", rid, d)) - return merged() + t.setUnhealthyLocked(noDERPHomeWarnable, nil) + return + } else if !t.derpRegionConnected[rid] { + t.setUnhealthyLocked(noDERPConnectionWarnable, Args{ + ArgRegionID: fmt.Sprint(rid), + }) + return + } else if d := now.Sub(t.derpRegionLastFrame[rid]).Round(time.Second); d > tooIdle { + t.setUnhealthyLocked(derpTimeoutWarnable, Args{ + ArgRegionID: fmt.Sprint(rid), + ArgDuration: d.String(), + }) + return } } + t.setHealthyLocked(noDERPHomeWarnable) + t.setHealthyLocked(noDERPConnectionWarnable) + t.setHealthyLocked(derpTimeoutWarnable) + if t.udp4Unbound { - add(errNoUDP4Bind) - return merged() + t.setUnhealthyLocked(noUDP4BindWarnable, nil) + return + } else { + t.setHealthyLocked(noUDP4BindWarnable) } // TODO: use @@ -738,43 +937,72 @@ func (t *Tracker) overallErrorLocked() error { _ = t.lastStreamedMapResponse _ = t.lastMapRequestHeard + shouldClearMagicsockWarnings := false for i := range t.MagicSockReceiveFuncs { f := &t.MagicSockReceiveFuncs[i] if f.missing { - errs = append(errs, fmt.Errorf("%s is not running", f.name)) + t.setUnhealthyLocked(magicsockReceiveFuncWarnable, Args{ + ArgMagicsockFunctionName: f.name, + }) + shouldClearMagicsockWarnings = false } } + if shouldClearMagicsockWarnings { + t.setHealthyLocked(magicsockReceiveFuncWarnable) + } + + // Iterates over the legacy subsystems and their error, and turns them into structured errors for sys, err := range t.sysErr { - if err == nil || sys == SysOverall { - continue - } - errs = append(errs, fmt.Errorf("%v: %w", sys, err)) + t.updateLegacyErrorWarnableLocked(sys, err) } - for _, w := range t.warnables { - if err := t.warnableVal[w]; err != nil { - errs = append(errs, err) + + if len(t.derpRegionHealthProblem) > 0 { + for regionID, problem := range t.derpRegionHealthProblem { + t.setUnhealthyLocked(derpRegionErrorWarnable, Args{ + ArgRegionID: fmt.Sprint(regionID), + ArgError: problem, + }) } + } else { + t.setHealthyLocked(derpRegionErrorWarnable) } - for regionID, problem := range t.derpRegionHealthProblem { - errs = append(errs, fmt.Errorf("derp%d: %v", regionID, problem)) - } - for _, s := range t.controlHealth { - errs = append(errs, errors.New(s)) + + if len(t.controlHealth) > 0 { + for _, s := range t.controlHealth { + t.setUnhealthyLocked(controlHealthWarnable, Args{ + ArgError: s, + }) + } + } else { + t.setHealthyLocked(controlHealthWarnable) } + if err := envknob.ApplyDiskConfigError(); err != nil { - errs = append(errs, err) + t.setUnhealthyLocked(applyDiskConfigWarnable, Args{ + ArgError: err.Error(), + }) + } else { + t.setHealthyLocked(applyDiskConfigWarnable) } - for serverName, err := range t.tlsConnectionErrors { - errs = append(errs, fmt.Errorf("TLS connection error for %q: %w", serverName, err)) + + if len(t.tlsConnectionErrors) > 0 { + for serverName, err := range t.tlsConnectionErrors { + t.setUnhealthyLocked(tlsConnectionFailedWarnable, Args{ + ArgServerName: serverName, + ArgError: err.Error(), + }) + } + } else { + t.setHealthyLocked(tlsConnectionFailedWarnable) } - if e := fakeErrForTesting(); len(errs) == 0 && e != "" { - return errors.New(e) + + if e := fakeErrForTesting(); len(t.warnables) == 0 && e != "" { + t.setUnhealthyLocked(testWarnable, Args{ + ArgError: e, + }) + } else { + t.setHealthyLocked(testWarnable) } - sort.Slice(errs, func(i, j int) bool { - // Not super efficient (stringifying these in a sort), but probably max 2 or 3 items. - return errs[i].Error() < errs[j].Error() - }) - return multierr.New(errs...) } // ReceiveFuncStats tracks the calls made to a wireguard-go receive func. diff --git a/health/health_test.go b/health/health_test.go index f6673d324e9be..ff4cddb70ab90 100644 --- a/health/health_test.go +++ b/health/health_test.go @@ -4,19 +4,23 @@ package health import ( - "errors" "fmt" "reflect" "testing" + "time" ) func TestAppendWarnableDebugFlags(t *testing.T) { var tr Tracker for i := range 10 { - w := NewWarnable(WithMapDebugFlag(fmt.Sprint(i))) + w := Register(&Warnable{ + Code: WarnableCode(fmt.Sprintf("warnable-code-%d", i)), + MapDebugFlag: fmt.Sprint(i), + }) + defer unregister(w) if i%2 == 0 { - tr.SetWarnable(w, errors.New("boom")) + tr.SetUnhealthy(w, Args{"test-arg": fmt.Sprint(i)}) } } @@ -49,3 +53,126 @@ func TestNilMethodsDontCrash(t *testing.T) { rv.Method(i).Call(args) } } + +func TestSetUnhealthyWithDuplicateThenHealthyAgain(t *testing.T) { + ht := Tracker{} + if len(ht.Strings()) != 0 { + t.Fatalf("before first insertion, len(newTracker.Strings) = %d; want = 0", len(ht.Strings())) + } + + ht.SetUnhealthy(testWarnable, Args{ArgError: "Hello world 1"}) + want := []string{"Hello world 1"} + if !reflect.DeepEqual(ht.Strings(), want) { + t.Fatalf("after calling SetUnhealthy, newTracker.Strings() = %v; want = %v", ht.Strings(), want) + } + + // Adding a second warning state with the same WarningCode overwrites the existing warning state, + // the count shouldn't have changed. + ht.SetUnhealthy(testWarnable, Args{ArgError: "Hello world 2"}) + want = []string{"Hello world 2"} + if !reflect.DeepEqual(ht.Strings(), want) { + t.Fatalf("after insertion of same WarningCode, newTracker.Strings() = %v; want = %v", ht.Strings(), want) + } + + ht.SetHealthy(testWarnable) + want = []string{} + if !reflect.DeepEqual(ht.Strings(), want) { + t.Fatalf("after setting the healthy, newTracker.Strings() = %v; want = %v", ht.Strings(), want) + } +} + +func TestRemoveAllWarnings(t *testing.T) { + ht := Tracker{} + if len(ht.Strings()) != 0 { + t.Fatalf("before first insertion, len(newTracker.Strings) = %d; want = 0", len(ht.Strings())) + } + + ht.SetUnhealthy(testWarnable, Args{"Text": "Hello world 1"}) + if len(ht.Strings()) != 1 { + t.Fatalf("after first insertion, len(newTracker.Strings) = %d; want = %d", len(ht.Strings()), 1) + } + + ht.SetHealthy(testWarnable) + if len(ht.Strings()) != 0 { + t.Fatalf("after RemoveAll, len(newTracker.Strings) = %d; want = 0", len(ht.Strings())) + } +} + +// TestWatcher tests that a registered watcher function gets called with the correct +// Warnable and non-nil/nil UnhealthyState upon setting a Warnable to unhealthy/healthy. +func TestWatcher(t *testing.T) { + ht := Tracker{} + wantText := "Hello world" + becameUnhealthy := make(chan struct{}) + becameHealthy := make(chan struct{}) + + watcherFunc := func(w *Warnable, us *UnhealthyState) { + if w != testWarnable { + t.Fatalf("watcherFunc was called, but with an unexpected Warnable: %v, want: %v", w, testWarnable) + } + + if us != nil { + if us.Text != wantText { + t.Fatalf("unexpected us.Text: %s, want: %s", us.Text, wantText) + } + if us.Args[ArgError] != wantText { + t.Fatalf("unexpected us.Args[ArgError]: %s, want: %s", us.Args[ArgError], wantText) + } + becameUnhealthy <- struct{}{} + } else { + becameHealthy <- struct{}{} + } + } + + unregisterFunc := ht.RegisterWatcher(watcherFunc) + if len(ht.watchers) != 1 { + t.Fatalf("after RegisterWatcher, len(newTracker.watchers) = %d; want = 1", len(ht.watchers)) + } + ht.SetUnhealthy(testWarnable, Args{ArgError: wantText}) + + select { + case <-becameUnhealthy: + // Test passed because the watcher got notified of an unhealthy state + case <-becameHealthy: + // Test failed because the watcher got of a healthy state instead of an unhealthy one + t.Fatalf("watcherFunc was called with a healthy state") + case <-time.After(1 * time.Second): + t.Fatalf("watcherFunc didn't get called upon calling SetUnhealthy") + } + + ht.SetHealthy(testWarnable) + + select { + case <-becameUnhealthy: + // Test failed because the watcher got of an unhealthy state instead of a healthy one + t.Fatalf("watcherFunc was called with an unhealthy state") + case <-becameHealthy: + // Test passed because the watcher got notified of a healthy state + case <-time.After(1 * time.Second): + t.Fatalf("watcherFunc didn't get called upon calling SetUnhealthy") + } + + unregisterFunc() + if len(ht.watchers) != 0 { + t.Fatalf("after unregisterFunc, len(newTracker.watchers) = %d; want = 0", len(ht.watchers)) + } +} + +func TestRegisterWarnablePanicsWithDuplicate(t *testing.T) { + w := &Warnable{ + Code: "test-warnable-1", + } + + Register(w) + defer unregister(w) + if registeredWarnables[w.Code] != w { + t.Fatalf("after Register, registeredWarnables[%s] = %v; want = %v", w.Code, registeredWarnables[w.Code], w) + } + + defer func() { + if r := recover(); r == nil { + t.Fatalf("Registering the same Warnable twice didn't panic") + } + }() + Register(w) +} diff --git a/health/state.go b/health/state.go new file mode 100644 index 0000000000000..1b12898eb98ba --- /dev/null +++ b/health/state.go @@ -0,0 +1,79 @@ +// Copyright (c) Tailscale Inc & AUTHORS +// SPDX-License-Identifier: BSD-3-Clause + +package health + +import ( + "time" +) + +// State contains the health status of the backend, and is +// provided to the client UI via LocalAPI through ipn.Notify. +type State struct { + // Each key-value pair in Warnings represents a Warnable that is currently + // unhealthy. If a Warnable is healthy, it will not be present in this map. + // When a Warnable is unhealthy and becomes healthy, its key-value pair + // disappears in the next issued State. Observers should treat the absence of + // a WarnableCode in this map as an indication that the Warnable became healthy, + // and may use that to clear any notifications that were previously shown to the user. + // If Warnings is nil, all Warnables are healthy and the backend is overall healthy. + Warnings map[WarnableCode]UnhealthyState +} + +// Representation contains information to be shown to the user to inform them +// that a Warnable is currently unhealthy. +type UnhealthyState struct { + WarnableCode WarnableCode + Severity Severity + Title string + Text string + BrokenSince *time.Time `json:",omitempty"` + Args Args `json:",omitempty"` +} + +// unhealthyState returns a unhealthyState of the Warnable given its current warningState. +func (w *Warnable) unhealthyState(ws *warningState) *UnhealthyState { + var text string + if ws.Args != nil { + text = w.Text(ws.Args) + } else { + text = w.Text(Args{}) + } + + return &UnhealthyState{ + WarnableCode: w.Code, + Severity: w.Severity, + Title: w.Title, + Text: text, + BrokenSince: &ws.BrokenSince, + Args: ws.Args, + } +} + +// CurrentState returns a snapshot of the current health status of the backend. +// It returns a State with nil Warnings if the backend is healthy (all Warnables +// have no issues). +// The returned State is a snapshot of shared memory, and the caller should not +// mutate the returned value. +func (t *Tracker) CurrentState() *State { + if t.nil() { + return &State{} + } + + t.mu.Lock() + defer t.mu.Unlock() + + if t.warnableVal == nil || len(t.warnableVal) == 0 { + return &State{} + } + + wm := map[WarnableCode]UnhealthyState{} + + for w, ws := range t.warnableVal { + wm[w.Code] = *w.unhealthyState(ws) + } + + return &State{ + Warnings: wm, + } +} diff --git a/health/warnings.go b/health/warnings.go new file mode 100644 index 0000000000000..fc769e6861d48 --- /dev/null +++ b/health/warnings.go @@ -0,0 +1,206 @@ +// Copyright (c) Tailscale Inc & AUTHORS +// SPDX-License-Identifier: BSD-3-Clause + +package health + +import ( + "fmt" +) + +/** +This file contains definitions for the Warnables maintained within this `health` package. +*/ + +// updateAvailableWarnable is a Warnable that warns the user that an update is available. +var updateAvailableWarnable = Register(&Warnable{ + Code: "update-available", + Title: "Update available", + Severity: SeverityLow, + Text: func(args Args) string { + return fmt.Sprintf("An update from version %s to %s is available. Run `tailscale update` or `tailscale set --auto-update` to update.", args[ArgCurrentVersion], args[ArgAvailableVersion]) + }, +}) + +// securityUpdateAvailableWarnable is a Warnable that warns the user that an important security update is available. +var securityUpdateAvailableWarnable = Register(&Warnable{ + Code: "security-update-available", + Title: "Security update available", + Severity: SeverityHigh, + Text: func(args Args) string { + return fmt.Sprintf("An urgent security update from version %s to %s is available. Run `tailscale update` or `tailscale set --auto-update` to update now.", args[ArgCurrentVersion], args[ArgAvailableVersion]) + }, +}) + +// unstableWarnable is a Warnable that warns the user that they are using an unstable version of Tailscale +// so they won't be surprised by all the issues that may arise. +var unstableWarnable = Register(&Warnable{ + Code: "is-using-unstable-version", + Title: "Using an unstable version", + Severity: SeverityLow, + Text: StaticMessage("This is an unstable version of Tailscale meant for testing and development purposes: please report any bugs to Tailscale."), +}) + +// NetworkStatusWarnable is a Warnable that warns the user that the network is down. +var NetworkStatusWarnable = Register(&Warnable{ + Code: "network-status", + Title: "Network down", + Severity: SeverityHigh, + Text: StaticMessage("Tailscale cannot connect because the network is down. (No network interface is up.)"), + ImpactsConnectivity: true, +}) + +// IPNStateWarnable is a Warnable that warns the user that Tailscale is stopped. +var IPNStateWarnable = Register(&Warnable{ + Code: "wantrunning-false", + Title: "Not connected to Tailscale", + Severity: SeverityLow, + Text: StaticMessage("Tailscale is stopped."), +}) + +// localLogWarnable is a Warnable that warns the user that the local log is misconfigured. +var localLogWarnable = Register(&Warnable{ + Code: "local-log-config-error", + Title: "Local log misconfiguration", + Severity: SeverityLow, + Text: func(args Args) string { + return fmt.Sprintf("The local log is misconfigured: %v", args[ArgError]) + }, +}) + +// LoginStateWarnable is a Warnable that warns the user that they are logged out, +// and provides the last login error if available. +var LoginStateWarnable = Register(&Warnable{ + Code: "login-state", + Title: "Logged out", + Severity: SeverityMedium, + Text: func(args Args) string { + if args[ArgError] != "" { + return fmt.Sprintf("You are logged out. The last login error was: %v", args[ArgError]) + } else { + return "You are logged out." + } + }, +}) + +// notInMapPollWarnable is a Warnable that warns the user that they cannot connect to the control server. +var notInMapPollWarnable = Register(&Warnable{ + Code: "not-in-map-poll", + Title: "Cannot connect to control server", + Severity: SeverityMedium, + DependsOn: []*Warnable{NetworkStatusWarnable}, + Text: StaticMessage("Cannot connect to the control server (not in map poll). Check your Internet connection."), +}) + +// noDERPHomeWarnable is a Warnable that warns the user that Tailscale doesn't have a home DERP. +var noDERPHomeWarnable = Register(&Warnable{ + Code: "no-derp-home", + Title: "No home relay server", + Severity: SeverityHigh, + DependsOn: []*Warnable{NetworkStatusWarnable}, + Text: StaticMessage("Tailscale could not connect to any relay server. Check your Internet connection."), +}) + +// noDERPConnectionWarnable is a Warnable that warns the user that Tailscale couldn't connect to a specific DERP server. +var noDERPConnectionWarnable = Register(&Warnable{ + Code: "no-derp-connection", + Title: "No relay server connection", + Severity: SeverityHigh, + DependsOn: []*Warnable{NetworkStatusWarnable}, + Text: func(args Args) string { + return fmt.Sprintf("Tailscale could not connect to the relay server '%s'. The server might be temporarily unavailable, or your Internet connection might be down.", args[ArgRegionID]) + }, +}) + +// derpTimeoutWarnable is a Warnable that warns the user that Tailscale hasn't heard from the home DERP region for a while. +var derpTimeoutWarnable = Register(&Warnable{ + Code: "derp-timed-out", + Title: "Relay server timed out", + Severity: SeverityMedium, + DependsOn: []*Warnable{NetworkStatusWarnable}, + Text: func(args Args) string { + return fmt.Sprintf("Tailscale hasn't heard from the home relay server (region %v) in %v. The server might be temporarily unavailable, or your Internet connection might be down.", args[ArgRegionID], args[ArgDuration]) + }, +}) + +// derpRegionErrorWarnable is a Warnable that warns the user that a DERP region is reporting an issue. +var derpRegionErrorWarnable = Register(&Warnable{ + Code: "derp-region-error", + Title: "Relay server error", + Severity: SeverityMedium, + DependsOn: []*Warnable{NetworkStatusWarnable}, + Text: func(args Args) string { + return fmt.Sprintf("The relay server #%v is reporting an issue: %v", args[ArgRegionID], args[ArgError]) + }, +}) + +// noUDP4BindWarnable is a Warnable that warns the user that Tailscale couldn't listen for incoming UDP connections. +var noUDP4BindWarnable = Register(&Warnable{ + Code: "no-udp4-bind", + Title: "Incoming connections may fail", + Severity: SeverityHigh, + DependsOn: []*Warnable{NetworkStatusWarnable}, + Text: StaticMessage("Tailscale couldn't listen for incoming UDP connections."), + ImpactsConnectivity: true, +}) + +// mapResponseTimeoutWarnable is a Warnable that warns the user that Tailscale hasn't received a network map from the coordination server in a while. +var mapResponseTimeoutWarnable = Register(&Warnable{ + Code: "mapresponse-timeout", + Title: "Network map response timeout", + Severity: SeverityMedium, + DependsOn: []*Warnable{NetworkStatusWarnable}, + Text: func(args Args) string { + return fmt.Sprintf("Tailscale hasn't received a network map from the coordination server in %s.", args[ArgDuration]) + }, +}) + +// tlsConnectionFailedWarnable is a Warnable that warns the user that Tailscale could not establish an encrypted connection with a server. +var tlsConnectionFailedWarnable = Register(&Warnable{ + Code: "tls-connection-failed", + Title: "Encrypted connection failed", + Severity: SeverityMedium, + DependsOn: []*Warnable{NetworkStatusWarnable}, + Text: func(args Args) string { + return fmt.Sprintf("Tailscale could not establish an encrypted connection with '%q': %v", args[ArgServerName], args[ArgError]) + }, +}) + +// magicsockReceiveFuncWarnable is a Warnable that warns the user that one of the Magicsock functions is not running. +var magicsockReceiveFuncWarnable = Register(&Warnable{ + Code: "magicsock-receive-func-error", + Title: "MagicSock function not running", + Severity: SeverityMedium, + Text: func(args Args) string { + return fmt.Sprintf("The MagicSock function %s is not running. You might experience connectivity issues.", args[ArgMagicsockFunctionName]) + }, +}) + +// testWarnable is a Warnable that is used within this package for testing purposes only. +var testWarnable = Register(&Warnable{ + Code: "test-warnable", + Title: "Test warnable", + Severity: SeverityLow, + Text: func(args Args) string { + return args[ArgError] + }, +}) + +// applyDiskConfigWarnable is a Warnable that warns the user that there was an error applying the envknob config stored on disk. +var applyDiskConfigWarnable = Register(&Warnable{ + Code: "apply-disk-config", + Title: "Could not apply configuration", + Severity: SeverityMedium, + Text: func(args Args) string { + return fmt.Sprintf("An error occurred applying the Tailscale envknob configuration stored on disk: %v", args[ArgError]) + }, +}) + +// controlHealthWarnable is a Warnable that warns the user that the coordination server is reporting an health issue. +var controlHealthWarnable = Register(&Warnable{ + Code: "control-health", + Title: "Coordination server reports an issue", + Severity: SeverityMedium, + Text: func(args Args) string { + return fmt.Sprintf("The coordination server is reporting an health issue: %v", args[ArgError]) + }, +}) diff --git a/ipn/backend.go b/ipn/backend.go index 6408d7e98668f..d6ba954089372 100644 --- a/ipn/backend.go +++ b/ipn/backend.go @@ -9,6 +9,7 @@ import ( "time" "tailscale.com/drive" + "tailscale.com/health" "tailscale.com/ipn/ipnstate" "tailscale.com/tailcfg" "tailscale.com/types/empty" @@ -70,6 +71,8 @@ const ( NotifyNoPrivateKeys // if set, private keys that would normally be sent in updates are zeroed out NotifyInitialDriveShares // if set, the first Notify message (sent immediately) will contain the current Taildrive Shares NotifyInitialOutgoingFiles // if set, the first Notify message (sent immediately) will contain the current Taildrop OutgoingFiles + + NotifyInitialHealthState // if set, the first Notify message (sent immediately) will contain the current health.State of the client ) // Notify is a communication from a backend (e.g. tailscaled) to a frontend @@ -138,6 +141,11 @@ type Notify struct { // empty value means that there are no shares. DriveShares views.SliceView[*drive.Share, drive.ShareView] + // Health is the last-known health state of the backend. When this field is + // non-nil, a change in health verified, and the API client should surface + // any changes to the user in the UI. + Health *health.State `json:",omitempty"` + // type is mirrored in xcode/Shared/IPN.swift } @@ -177,6 +185,9 @@ func (n Notify) String() string { if n.LocalTCPPort != nil { fmt.Fprintf(&sb, "tcpport=%v ", n.LocalTCPPort) } + if n.Health != nil { + sb.WriteString("Health{...} ") + } s := sb.String() return s[0:len(s)-1] + "}" } diff --git a/ipn/ipnlocal/local.go b/ipn/ipnlocal/local.go index 213a00e212936..c219a391ea42e 100644 --- a/ipn/ipnlocal/local.go +++ b/ipn/ipnlocal/local.go @@ -666,12 +666,18 @@ func (b *LocalBackend) linkChange(delta *netmon.ChangeDelta) { } } -func (b *LocalBackend) onHealthChange(sys health.Subsystem, err error) { - if err == nil { - b.logf("health(%q): ok", sys) +func (b *LocalBackend) onHealthChange(w *health.Warnable, us *health.UnhealthyState) { + if us == nil { + b.logf("health(warnable=%s): ok", w.Code) } else { - b.logf("health(%q): error: %v", sys, err) + b.logf("health(warnable=%s): error: %s", w.Code, us.Text) } + + // Whenever health changes, send the current health state to the frontend. + state := b.health.CurrentState() + b.send(ipn.Notify{ + Health: state, + }) } // Shutdown halts the backend and all its sub-components. The backend @@ -788,7 +794,7 @@ func (b *LocalBackend) UpdateStatus(sb *ipnstate.StatusBuilder) { if prefs := b.pm.CurrentPrefs(); prefs.Valid() && prefs.AutoUpdate().Check { s.ClientVersion = b.lastClientVersion } - s.Health = b.health.AppendWarnings(s.Health) + s.Health = b.health.Strings() s.HaveNodeKey = b.hasNodeKeyLocked() // TODO(bradfitz): move this health check into a health.Warnable @@ -1870,7 +1876,13 @@ func (b *LocalBackend) Start(opts ipn.Options) error { return nil } -var warnInvalidUnsignedNodes = health.NewWarnable() +// invalidPacketFilterWarnable is a Warnable to warn the user that the control server sent an invalid packet filter. +var invalidPacketFilterWarnable = health.Register(&health.Warnable{ + Code: "invalid-packet-filter", + Title: "Invalid packet filter", + Severity: health.SeverityHigh, + Text: health.StaticMessage("The coordination server sent an invalid packet filter permitting traffic to unlocked nodes; rejecting all packets for safety"), +}) // updateFilterLocked updates the packet filter in wgengine based on the // given netMap and user preferences. @@ -1902,11 +1914,10 @@ func (b *LocalBackend) updateFilterLocked(netMap *netmap.NetworkMap, prefs ipn.P packetFilter = netMap.PacketFilter if packetFilterPermitsUnlockedNodes(b.peers, packetFilter) { - err := errors.New("server sent invalid packet filter permitting traffic to unlocked nodes; rejecting all packets for safety") - b.health.SetWarnable(warnInvalidUnsignedNodes, err) + b.health.SetUnhealthy(invalidPacketFilterWarnable, nil) packetFilter = nil } else { - b.health.SetWarnable(warnInvalidUnsignedNodes, nil) + b.health.SetHealthy(invalidPacketFilterWarnable) } } if prefs.Valid() { @@ -2309,6 +2320,9 @@ func (b *LocalBackend) WatchNotifications(ctx context.Context, mask ipn.NotifyWa if mask&ipn.NotifyInitialDriveShares != 0 && b.driveSharingEnabledLocked() { ini.DriveShares = b.pm.prefs.DriveShares() } + if mask&ipn.NotifyInitialHealthState != 0 { + ini.Health = b.HealthTracker().CurrentState() + } } mak.Set(&b.notifyWatchers, sessionID, &watchSession{ch, sessionID}) @@ -3120,20 +3134,31 @@ func (b *LocalBackend) isDefaultServerLocked() bool { return prefs.ControlURLOrDefault() == ipn.DefaultControlURL } -var warnExitNodeUsage = health.NewWarnable(health.WithConnectivityImpact()) +var exitNodeMisconfigurationWarnable = health.Register(&health.Warnable{ + Code: "exit-node-misconfiguration", + Title: "Exit node misconfiguration", + Severity: health.SeverityMedium, + Text: func(args health.Args) string { + return "Exit node misconfiguration: " + args[health.ArgError] + }, +}) // updateExitNodeUsageWarning updates a warnable meant to notify users of // configuration issues that could break exit node usage. -func updateExitNodeUsageWarning(p ipn.PrefsView, state *netmon.State, health *health.Tracker) { - var result error +func updateExitNodeUsageWarning(p ipn.PrefsView, state *netmon.State, healthTracker *health.Tracker) { + var msg string if p.ExitNodeIP().IsValid() || p.ExitNodeID() != "" { warn, _ := netutil.CheckReversePathFiltering(state) const comment = "please set rp_filter=2 instead of rp_filter=1; see https://github.com/tailscale/tailscale/issues/3310" if len(warn) > 0 { - result = fmt.Errorf("%s: %v, %s", healthmsg.WarnExitNodeUsage, warn, comment) + msg = fmt.Sprintf("%s: %v, %s", healthmsg.WarnExitNodeUsage, warn, comment) } } - health.SetWarnable(warnExitNodeUsage, result) + if len(msg) > 0 { + healthTracker.SetUnhealthy(exitNodeMisconfigurationWarnable, health.Args{health.ArgError: msg}) + } else { + healthTracker.SetHealthy(exitNodeMisconfigurationWarnable) + } } func (b *LocalBackend) checkExitNodePrefsLocked(p *ipn.Prefs) error { @@ -5841,13 +5866,18 @@ func (b *LocalBackend) sshServerOrInit() (_ SSHServer, err error) { return b.sshServer, nil } -var warnSSHSELinux = health.NewWarnable() +var warnSSHSELinuxWarnable = health.Register(&health.Warnable{ + Code: "ssh-unavailable-selinux-enabled", + Title: "Tailscale SSH and SELinux", + Severity: health.SeverityLow, + Text: health.StaticMessage("SELinux is enabled; Tailscale SSH may not work. See https://tailscale.com/s/ssh-selinux"), +}) func (b *LocalBackend) updateSELinuxHealthWarning() { if hostinfo.IsSELinuxEnforcing() { - b.health.SetWarnable(warnSSHSELinux, errors.New("SELinux is enabled; Tailscale SSH may not work. See https://tailscale.com/s/ssh-selinux")) + b.health.SetUnhealthy(warnSSHSELinuxWarnable, nil) } else { - b.health.SetWarnable(warnSSHSELinux, nil) + b.health.SetHealthy(warnSSHSELinuxWarnable) } } diff --git a/net/dns/direct_linux.go b/net/dns/direct_linux.go index 1de22e1b940c5..319462170387b 100644 --- a/net/dns/direct_linux.go +++ b/net/dns/direct_linux.go @@ -6,7 +6,6 @@ package dns import ( "bytes" "context" - "errors" "github.com/illarion/gonotify" "tailscale.com/health" @@ -58,7 +57,12 @@ func (m *directManager) runFileWatcher() { } } -var warnTrample = health.NewWarnable() +var resolvTrampleWarnable = health.Register(&health.Warnable{ + Code: "resolv-conf-overwritten", + Severity: health.SeverityMedium, + Title: "Linux DNS configuration issue", + Text: health.StaticMessage("Linux DNS config not ideal. /etc/resolv.conf overwritten. See https://tailscale.com/s/dns-fight"), +}) // checkForFileTrample checks whether /etc/resolv.conf has been trampled // by another program on the system. (e.g. a DHCP client) @@ -78,7 +82,7 @@ func (m *directManager) checkForFileTrample() { return } if bytes.Equal(cur, want) { - m.health.SetWarnable(warnTrample, nil) + m.health.SetHealthy(resolvTrampleWarnable) if lastWarn != nil { m.mu.Lock() m.lastWarnContents = nil @@ -101,7 +105,7 @@ func (m *directManager) checkForFileTrample() { show = show[:1024] } m.logf("trample: resolv.conf changed from what we expected. did some other program interfere? current contents: %q", show) - m.health.SetWarnable(warnTrample, errors.New("Linux DNS config not ideal. /etc/resolv.conf overwritten. See https://tailscale.com/s/dns-fight")) + m.health.SetUnhealthy(resolvTrampleWarnable, nil) } func (m *directManager) closeInotifyOnDone(ctx context.Context, in *gonotify.Inotify) { diff --git a/wgengine/router/ifconfig_windows.go b/wgengine/router/ifconfig_windows.go index b9b1873b8f381..779acc881da6a 100644 --- a/wgengine/router/ifconfig_windows.go +++ b/wgengine/router/ifconfig_windows.go @@ -14,17 +14,18 @@ import ( "sort" "time" - ole "github.com/go-ole/go-ole" - "github.com/tailscale/wireguard-go/tun" - "go4.org/netipx" - "golang.org/x/sys/windows" - "golang.zx2c4.com/wireguard/windows/tunnel/winipcfg" "tailscale.com/health" "tailscale.com/net/netmon" "tailscale.com/net/tsaddr" "tailscale.com/net/tstun" "tailscale.com/util/multierr" "tailscale.com/wgengine/winnet" + + ole "github.com/go-ole/go-ole" + "github.com/tailscale/wireguard-go/tun" + "go4.org/netipx" + "golang.org/x/sys/windows" + "golang.zx2c4.com/wireguard/windows/tunnel/winipcfg" ) // monitorDefaultRoutes subscribes to route change events and updates @@ -235,9 +236,17 @@ func interfaceFromLUID(luid winipcfg.LUID, flags winipcfg.GAAFlags) (*winipcfg.I return nil, fmt.Errorf("interfaceFromLUID: interface with LUID %v not found", luid) } -var networkCategoryWarning = health.NewWarnable(health.WithMapDebugFlag("warn-network-category-unhealthy")) - -func configureInterface(cfg *Config, tun *tun.NativeTun, health *health.Tracker) (retErr error) { +var networkCategoryWarnable = health.Register(&health.Warnable{ + Code: "set-network-category-failed", + Severity: health.SeverityMedium, + Title: "Windows network configuration failed", + Text: func(args health.Args) string { + return fmt.Sprintf("Failed to set the network category to private on the Tailscale adapter. This may prevent Tailscale from working correctly. Error: %s", args[health.ArgError]) + }, + MapDebugFlag: "warn-network-category-unhealthy", +}) + +func configureInterface(cfg *Config, tun *tun.NativeTun, ht *health.Tracker) (retErr error) { var mtu = tstun.DefaultTUNMTU() luid := winipcfg.LUID(tun.LUID()) iface, err := interfaceFromLUID(luid, @@ -268,10 +277,10 @@ func configureInterface(cfg *Config, tun *tun.NativeTun, health *health.Tracker) for i := range tries { found, err := setPrivateNetwork(luid) if err != nil { - health.SetWarnable(networkCategoryWarning, fmt.Errorf("set-network-category: %w", err)) + ht.SetUnhealthy(networkCategoryWarnable, health.Args{health.ArgError: err.Error()}) log.Printf("setPrivateNetwork(try=%d): %v", i, err) } else { - health.SetWarnable(networkCategoryWarning, nil) + ht.SetHealthy(networkCategoryWarnable) if found { if i > 0 { log.Printf("setPrivateNetwork(try=%d): success", i) diff --git a/wgengine/router/router_linux.go b/wgengine/router/router_linux.go index a328075d99d0f..2af73e26d2f28 100644 --- a/wgengine/router/router_linux.go +++ b/wgengine/router/router_linux.go @@ -445,12 +445,17 @@ func (r *linuxRouter) Set(cfg *Config) error { return multierr.New(errs...) } -var warnStatefulFilteringWithDocker = health.NewWarnable() +var dockerStatefulFilteringWarnable = health.Register(&health.Warnable{ + Code: "docker-stateful-filtering", + Title: "Docker with stateful filtering", + Severity: health.SeverityMedium, + Text: health.StaticMessage("Stateful filtering is enabled and Docker was detected; this may prevent Docker containers on this host from resolving DNS and connecting to Tailscale nodes. See https://tailscale.com/s/stateful-docker"), +}) func (r *linuxRouter) updateStatefulFilteringWithDockerWarning(cfg *Config) { // If stateful filtering is disabled, clear the warning. if !r.statefulFiltering { - r.health.SetWarnable(warnStatefulFilteringWithDocker, nil) + r.health.SetHealthy(dockerStatefulFilteringWarnable) return } @@ -479,17 +484,13 @@ func (r *linuxRouter) updateStatefulFilteringWithDockerWarning(cfg *Config) { // socket/daemon/etc. ifstate := r.netMon.InterfaceState() if _, found := ifstate.Interface["docker0"]; found { - r.health.SetWarnable(warnStatefulFilteringWithDocker, fmt.Errorf(""+ - "Stateful filtering is enabled and Docker was detected; this may prevent Docker containers "+ - "on this host from resolving DNS and connecting to Tailscale nodes. "+ - "See https://tailscale.com/s/stateful-docker", - )) + r.health.SetUnhealthy(dockerStatefulFilteringWarnable, nil) return } } // If we get here, then we have no warnings; clear anything existing. - r.health.SetWarnable(warnStatefulFilteringWithDocker, nil) + r.health.SetHealthy(dockerStatefulFilteringWarnable) } // UpdateMagicsockPort implements the Router interface.