Skip to content

Commit

Permalink
Reduce metrics resources (#123)
Browse files Browse the repository at this point in the history
This PR:
- Removes `location` from the `shadowsocks_tcp_probes` time series, which is the largest one.
- Removes 0s from `shadowsocks_data_bytes`.  We were creating 4 entries (each proxy leg) for every country that attempted to connect, even if they were unsuccessful connections. The metric was dominated by zeros.
- Reduces the cardinality of `shadowsocks_data_bytes` by removing the status and location. We don't need that for data limits. To enable country usage tracking, I introduce `shadowsocks_data_bytes_per_location`. To enable status tracking for UDP, I introduce `shadowsocks_udp_packets_from_client_per_location`. TCP already had `shadowsocks_connections_closed` for status tracking.
  • Loading branch information
fortuna authored Oct 20, 2022
1 parent aa13697 commit fbc1b4f
Show file tree
Hide file tree
Showing 7 changed files with 61 additions and 44 deletions.
3 changes: 0 additions & 3 deletions .github/CODEOWNERS
Validating CODEOWNERS rules …
Original file line number Diff line number Diff line change
@@ -1,4 +1 @@
* @Jigsaw-Code/outline-networking-owners

*.md @Jigsaw-Code/outline-strings-owners
LICENSE @Jigsaw-Code/outline-strings-owners
4 changes: 0 additions & 4 deletions service/PROBES.md
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,3 @@ This feature is on by default in Outline. Admins who are using outline-ss-serve
Shadowsocks uses the same Key Derivation Function for both upstream and downstream flows, so in principle an attacker could record data sent from the server to the client, and use it in a "reflected replay" attack as simulated client->server data. The data would appear to be valid and authenticated to the server, but the connection would most likely fail when attempting to parse the destination address header, perhaps leading to a distinctive failure behavior.

To avoid this class of attacks, outline-ss-server uses an [HMAC](https://en.wikipedia.org/wiki/HMAC) with a 32-bit tag to mark all server handshakes, and checks for the presence of this tag in all incoming handshakes. If the tag is present, the connection is a reflected replay, with a false positive probability of 1 in 4 billion.

## Metrics

Outline provides server operators with metrics on a variety of aspects of server activity, including any detected attacks. To observe attacks detected by your server, look at the `tcp_probes` histogram vector in Prometheus. The `status` field will be `"ERR_CIPHER"` (indicating invalid probe data), `"ERR_REPLAY_CLIENT"`, or `"ERR_REPLAY_SERVER"`, depending on the kind of attack your server observed. You can also see what country each probe appeared to originate from, and approximately how many bytes were sent before giving up.
87 changes: 56 additions & 31 deletions service/metrics/metrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@ type ShadowsocksMetrics interface {
// TCP metrics
AddOpenTCPConnection(clientLocation string)
AddClosedTCPConnection(clientLocation, accessKey, status string, data ProxyMetrics, timeToCipher, duration time.Duration)
AddTCPProbe(clientLocation, status, drainResult string, port int, data ProxyMetrics)
AddTCPProbe(status, drainResult string, port int, data ProxyMetrics)

// UDP metrics
AddUDPPacketFromClient(clientLocation, accessKey, status string, clientProxyBytes, proxyTargetBytes int, timeToCipher time.Duration)
Expand All @@ -50,23 +50,26 @@ type ShadowsocksMetrics interface {
type shadowsocksMetrics struct {
ipCountryDB *geoip2.Reader

buildInfo *prometheus.GaugeVec
accessKeys prometheus.Gauge
ports prometheus.Gauge
dataBytes *prometheus.CounterVec
timeToCipherMs *prometheus.HistogramVec
buildInfo *prometheus.GaugeVec
accessKeys prometheus.Gauge
ports prometheus.Gauge
dataBytes *prometheus.CounterVec
dataBytesPerLocation *prometheus.CounterVec
timeToCipherMs *prometheus.HistogramVec
// TODO: Add time to first byte.

tcpProbes *prometheus.HistogramVec
tcpOpenConnections *prometheus.CounterVec
tcpClosedConnections *prometheus.CounterVec
tcpConnectionDurationMs *prometheus.HistogramVec

udpAddedNatEntries prometheus.Counter
udpRemovedNatEntries prometheus.Counter
udpPacketsFromClientPerLocation *prometheus.CounterVec
udpAddedNatEntries prometheus.Counter
udpRemovedNatEntries prometheus.Counter
}

func newShadowsocksMetrics(ipCountryDB *geoip2.Reader) *shadowsocksMetrics {
// Don't forget to pass the counters to the registerer.MustRegister call in NewPrometheusShadowsocksMetrics.
return &shadowsocksMetrics{
ipCountryDB: ipCountryDB,
buildInfo: prometheus.NewGaugeVec(prometheus.GaugeOpts{
Expand All @@ -84,6 +87,12 @@ func newShadowsocksMetrics(ipCountryDB *geoip2.Reader) *shadowsocksMetrics {
Name: "ports",
Help: "Count of open Shadowsocks ports",
}),
tcpProbes: prometheus.NewHistogramVec(prometheus.HistogramOpts{
Namespace: "shadowsocks",
Name: "tcp_probes",
Buckets: []float64{0, 49, 50, 51, 73, 91},
Help: "Histogram of number of bytes from client to proxy, for detecting possible probes",
}, []string{"port", "status", "error"}),
tcpOpenConnections: prometheus.NewCounterVec(prometheus.CounterOpts{
Namespace: "shadowsocks",
Subsystem: "tcp",
Expand Down Expand Up @@ -115,21 +124,28 @@ func newShadowsocksMetrics(ipCountryDB *geoip2.Reader) *shadowsocksMetrics {
prometheus.CounterOpts{
Namespace: "shadowsocks",
Name: "data_bytes",
Help: "Bytes transferred by the proxy",
}, []string{"dir", "proto", "location", "status", "access_key"}),
tcpProbes: prometheus.NewHistogramVec(prometheus.HistogramOpts{
Namespace: "shadowsocks",
Name: "tcp_probes",
Buckets: []float64{0, 49, 50, 51, 73, 91},
Help: "Histogram of number of bytes from client to proxy, for detecting possible probes",
}, []string{"location", "port", "status", "error"}),
Help: "Bytes transferred by the proxy, per access key",
}, []string{"dir", "proto", "access_key"}),
dataBytesPerLocation: prometheus.NewCounterVec(
prometheus.CounterOpts{
Namespace: "shadowsocks",
Name: "data_bytes_per_location",
Help: "Bytes transferred by the proxy, per location",
}, []string{"dir", "proto", "location"}),
timeToCipherMs: prometheus.NewHistogramVec(
prometheus.HistogramOpts{
Namespace: "shadowsocks",
Name: "time_to_cipher_ms",
Help: "Time needed to find the cipher",
Buckets: []float64{0.1, 1, 10, 100, 1000},
}, []string{"proto", "found_key"}),
udpPacketsFromClientPerLocation: prometheus.NewCounterVec(
prometheus.CounterOpts{
Namespace: "shadowsocks",
Subsystem: "udp",
Name: "packets_from_client_per_location",
Help: "Packets received from the client, per location and status",
}, []string{"location", "status"}),
udpAddedNatEntries: prometheus.NewCounter(
prometheus.CounterOpts{
Namespace: "shadowsocks",
Expand All @@ -154,8 +170,8 @@ func newShadowsocksMetrics(ipCountryDB *geoip2.Reader) *shadowsocksMetrics {
func NewPrometheusShadowsocksMetrics(ipCountryDB *geoip2.Reader, registerer prometheus.Registerer) ShadowsocksMetrics {
m := newShadowsocksMetrics(ipCountryDB)
// TODO: Is it possible to pass where to register the collectors?
registerer.MustRegister(m.buildInfo, m.accessKeys, m.ports, m.tcpOpenConnections, m.tcpProbes, m.tcpClosedConnections, m.tcpConnectionDurationMs,
m.dataBytes, m.timeToCipherMs, m.udpAddedNatEntries, m.udpRemovedNatEntries)
registerer.MustRegister(m.buildInfo, m.accessKeys, m.ports, m.tcpProbes, m.tcpOpenConnections, m.tcpClosedConnections, m.tcpConnectionDurationMs,
m.dataBytes, m.dataBytesPerLocation, m.timeToCipherMs, m.udpPacketsFromClientPerLocation, m.udpAddedNatEntries, m.udpRemovedNatEntries)
return m
}

Expand Down Expand Up @@ -216,35 +232,44 @@ func isFound(accessKey string) string {
}

// addIfNonZero helps avoid the creation of series that are always zero.
func addIfNonZero(counter prometheus.Counter, value int64) {
func addIfNonZero(value int64, counterVec *prometheus.CounterVec, lvs ...string) {
if value > 0 {
counter.Add(float64(value))
counterVec.WithLabelValues(lvs...).Add(float64(value))
}
}

func (m *shadowsocksMetrics) AddClosedTCPConnection(clientLocation, accessKey, status string, data ProxyMetrics, timeToCipher, duration time.Duration) {
m.tcpClosedConnections.WithLabelValues(clientLocation, status, accessKey).Inc()
m.tcpConnectionDurationMs.WithLabelValues(status).Observe(duration.Seconds() * 1000)
m.timeToCipherMs.WithLabelValues("tcp", isFound(accessKey)).Observe(timeToCipher.Seconds() * 1000)
addIfNonZero(m.dataBytes.WithLabelValues("c>p", "tcp", clientLocation, status, accessKey), data.ClientProxy)
addIfNonZero(m.dataBytes.WithLabelValues("p>t", "tcp", clientLocation, status, accessKey), data.ProxyTarget)
addIfNonZero(m.dataBytes.WithLabelValues("p<t", "tcp", clientLocation, status, accessKey), data.TargetProxy)
addIfNonZero(m.dataBytes.WithLabelValues("c<p", "tcp", clientLocation, status, accessKey), data.ProxyClient)
addIfNonZero(data.ClientProxy, m.dataBytes, "c>p", "tcp", accessKey)
addIfNonZero(data.ClientProxy, m.dataBytesPerLocation, "c>p", "tcp", clientLocation)
addIfNonZero(data.ProxyTarget, m.dataBytes, "p>t", "tcp", accessKey)
addIfNonZero(data.ProxyTarget, m.dataBytesPerLocation, "p>t", "tcp", clientLocation)
addIfNonZero(data.TargetProxy, m.dataBytes, "p<t", "tcp", accessKey)
addIfNonZero(data.TargetProxy, m.dataBytesPerLocation, "p<t", "tcp", clientLocation)
addIfNonZero(data.ProxyClient, m.dataBytes, "c<p", "tcp", accessKey)
addIfNonZero(data.ProxyClient, m.dataBytesPerLocation, "c<p", "tcp", clientLocation)
}

func (m *shadowsocksMetrics) AddTCPProbe(clientLocation, status, drainResult string, port int, data ProxyMetrics) {
m.tcpProbes.WithLabelValues(clientLocation, strconv.Itoa(port), status, drainResult).Observe(float64(data.ClientProxy))
func (m *shadowsocksMetrics) AddTCPProbe(status, drainResult string, port int, data ProxyMetrics) {
m.tcpProbes.WithLabelValues(strconv.Itoa(port), status, drainResult).Observe(float64(data.ClientProxy))
}

func (m *shadowsocksMetrics) AddUDPPacketFromClient(clientLocation, accessKey, status string, clientProxyBytes, proxyTargetBytes int, timeToCipher time.Duration) {
m.timeToCipherMs.WithLabelValues("udp", isFound(accessKey)).Observe(timeToCipher.Seconds() * 1000)
addIfNonZero(m.dataBytes.WithLabelValues("c>p", "udp", clientLocation, status, accessKey), int64(clientProxyBytes))
addIfNonZero(m.dataBytes.WithLabelValues("p>t", "udp", clientLocation, status, accessKey), int64(proxyTargetBytes))
m.udpPacketsFromClientPerLocation.WithLabelValues(clientLocation, status).Inc()
addIfNonZero(int64(clientProxyBytes), m.dataBytes, "c>p", "udp", accessKey)
addIfNonZero(int64(clientProxyBytes), m.dataBytesPerLocation, "c>p", "udp", clientLocation)
addIfNonZero(int64(proxyTargetBytes), m.dataBytes, "p>t", "udp", accessKey)
addIfNonZero(int64(proxyTargetBytes), m.dataBytesPerLocation, "p>t", "udp", clientLocation)
}

func (m *shadowsocksMetrics) AddUDPPacketFromTarget(clientLocation, accessKey, status string, targetProxyBytes, proxyClientBytes int) {
addIfNonZero(m.dataBytes.WithLabelValues("p<t", "udp", clientLocation, status, accessKey), int64(targetProxyBytes))
addIfNonZero(m.dataBytes.WithLabelValues("c<p", "udp", clientLocation, status, accessKey), int64(proxyClientBytes))
addIfNonZero(int64(targetProxyBytes), m.dataBytes, "p<t", "udp", accessKey)
addIfNonZero(int64(targetProxyBytes), m.dataBytesPerLocation, "p<t", "udp", clientLocation)
addIfNonZero(int64(proxyClientBytes), m.dataBytes, "c<p", "udp", accessKey)
addIfNonZero(int64(proxyClientBytes), m.dataBytesPerLocation, "c<p", "udp", clientLocation)
}

func (m *shadowsocksMetrics) AddUDPNatEntry() {
Expand Down Expand Up @@ -310,7 +335,7 @@ func MeasureConn(conn onet.DuplexConn, bytesSent, bytesReceived *int64) onet.Dup
type NoOpMetrics struct{}

func (m *NoOpMetrics) SetBuildInfo(version string) {}
func (m *NoOpMetrics) AddTCPProbe(clientLocation, status, drainResult string, port int, data ProxyMetrics) {
func (m *NoOpMetrics) AddTCPProbe(status, drainResult string, port int, data ProxyMetrics) {
}
func (m *NoOpMetrics) AddClosedTCPConnection(clientLocation, accessKey, status string, data ProxyMetrics, timeToCipher, duration time.Duration) {
}
Expand Down
5 changes: 2 additions & 3 deletions service/metrics/metrics_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ func TestMethodsDontPanic(t *testing.T) {
ssMetrics.SetNumAccessKeys(20, 2)
ssMetrics.AddOpenTCPConnection("US")
ssMetrics.AddClosedTCPConnection("US", "1", "OK", proxyMetrics, 10*time.Millisecond, 100*time.Millisecond)
ssMetrics.AddTCPProbe("US", "ERR_CIPHER", "eof", 443, proxyMetrics)
ssMetrics.AddTCPProbe("ERR_CIPHER", "eof", 443, proxyMetrics)
ssMetrics.AddUDPPacketFromClient("US", "2", "OK", 10, 20, 10*time.Millisecond)
ssMetrics.AddUDPPacketFromTarget("US", "3", "OK", 10, 20)
ssMetrics.AddUDPNatEntry()
Expand Down Expand Up @@ -73,14 +73,13 @@ func BenchmarkCloseTCP(b *testing.B) {

func BenchmarkProbe(b *testing.B) {
ssMetrics := NewPrometheusShadowsocksMetrics(nil, prometheus.NewRegistry())
clientLocation := "ZZ"
status := "ERR_REPLAY"
drainResult := "other"
port := 12345
data := ProxyMetrics{}
b.ResetTimer()
for i := 0; i < b.N; i++ {
ssMetrics.AddTCPProbe(clientLocation, status, drainResult, port, data)
ssMetrics.AddTCPProbe(status, drainResult, port, data)
}
}

Expand Down
2 changes: 1 addition & 1 deletion service/tcp.go
Original file line number Diff line number Diff line change
Expand Up @@ -321,7 +321,7 @@ func (s *tcpService) absorbProbe(listenerPort int, clientConn io.ReadCloser, cli
_, drainErr := io.Copy(ioutil.Discard, clientConn) // drain socket
drainResult := drainErrToString(drainErr)
logger.Debugf("Drain error: %v, drain result: %v", drainErr, drainResult)
s.m.AddTCPProbe(clientLocation, status, drainResult, listenerPort, *proxyMetrics)
s.m.AddTCPProbe(status, drainResult, listenerPort, *proxyMetrics)
}

func drainErrToString(drainErr error) string {
Expand Down
2 changes: 1 addition & 1 deletion service/tcp_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -224,7 +224,7 @@ type probeTestMetrics struct {
closeStatus []string
}

func (m *probeTestMetrics) AddTCPProbe(clientLocation, status, drainResult string, port int, data metrics.ProxyMetrics) {
func (m *probeTestMetrics) AddTCPProbe(status, drainResult string, port int, data metrics.ProxyMetrics) {
m.mu.Lock()
m.probeData = append(m.probeData, data)
m.probeStatus = append(m.probeStatus, status)
Expand Down
2 changes: 1 addition & 1 deletion service/udp_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -102,7 +102,7 @@ type natTestMetrics struct {
upstreamPackets []udpReport
}

func (m *natTestMetrics) AddTCPProbe(clientLocation, status, drainResult string, port int, data metrics.ProxyMetrics) {
func (m *natTestMetrics) AddTCPProbe(status, drainResult string, port int, data metrics.ProxyMetrics) {
}
func (m *natTestMetrics) AddClosedTCPConnection(clientLocation, accessKey, status string, data metrics.ProxyMetrics, timeToCipher, duration time.Duration) {
}
Expand Down

0 comments on commit fbc1b4f

Please sign in to comment.