DataDog · gbbr · Jan 28, 2019 · Jan 24, 2019 · Jan 28, 2019 · LotharSee
diff --git a/pkg/trace/agent/tags.go b/pkg/trace/agent/tags.go
@@ -1,7 +1,6 @@
 package agent
 
 import (
-	"bytes"
 	"sort"
 	"strings"
 	"unicode"
@@ -251,54 +250,53 @@ func FilterTags(tags, groups []string) []string {
 // backend requirements
 // taken from dd-go.model.NormalizeTag
+//
+// The tag is lower-cased; leading characters that are not a letter or ':'
+// are trimmed; every run of characters other than letters, digits, ':',
+// '.', '/' and '-' becomes a single '_' (dropped at the end of the tag);
+// at most maxTagLength characters are kept, not counting those '_'.
 func NormalizeTag(tag string) string {
-	// unless you just throw out unicode, this is already as fast as it gets
-
-	buf := bytes.NewBuffer(make([]byte, 0, 2*len(tag)))
-	lastWasUnderscore := false
-
-	for _, c := range tag {
-		// fast path for len check
-		if buf.Len() >= maxTagLength {
-			break
-		}
-		// fast path for ascii alphabetic chars
-		switch {
-		case c >= 'a' && c <= 'z':
-			buf.WriteRune(c)
-			lastWasUnderscore = false
-			continue
-		case c >= 'A' && c <= 'Z':
-			c -= 'A' - 'a'
-			buf.WriteRune(c)
-			lastWasUnderscore = false
-			continue
-		}
-
-		c = unicode.ToLower(c)
-		switch {
-		// handle always valid cases
-		case unicode.IsLetter(c) || c == ':':
-			buf.WriteRune(c)
-			lastWasUnderscore = false
-		// skip any characters that can't start the string
-		case buf.Len() == 0:
-			continue
-		// handle valid characters that can't start the string.
-		case unicode.IsDigit(c) || c == '.' || c == '/' || c == '-':
-			buf.WriteRune(c)
-			lastWasUnderscore = false
-		// convert anything else to underscores (including underscores), but only allow one in a row.
-		case !lastWasUnderscore:
-			buf.WriteRune('_')
-			lastWasUnderscore = true
-		}
-	}
-
-	// strip trailing underscores
-	if lastWasUnderscore {
-		b := buf.Bytes()
-		return string(b[:len(b)-1])
-	}
-
-	return buf.String()
+	var (
+		chars   int  // number of characters kept so far
+		pending bool // an illegal run is waiting to be written as '_'
+	)
+	var out strings.Builder
+	out.Grow(len(tag)) // reserve the input's size up front
+	for _, c := range tag {
+		if chars >= maxTagLength {
+			// we've reached the maximum
+			break
+		}
+		switch {
+		case c >= 'a' && c <= 'z':
+			// fast path; nothing to change
+		case c >= 'A' && c <= 'Z':
+			// fast path; lower-case
+			c += 'a' - 'A'
+		default:
+			c = unicode.ToLower(c)
+			switch {
+			case unicode.IsLetter(c) || c == ':':
+				// always valid
+			case chars == 0:
+				// this character cannot start the string, trim
+				continue
+			case unicode.IsDigit(c) || c == '.' || c == '/' || c == '-':
+				// valid anywhere but at the start
+			default:
+				// illegal character: remember that a '_' is owed; it is
+				// only written once a valid character follows, so runs
+				// collapse and a trailing run is dropped
+				pending = true
+				continue
+			}
+		}
+		if pending {
+			out.WriteByte('_')
+			pending = false
+		}
+		// written rune by rune so multi-byte characters stay intact
+		out.WriteRune(c)
+		chars++
+	}
+	return out.String()
 }
diff --git a/pkg/trace/agent/tags_test.go b/pkg/trace/agent/tags_test.go
@@ -145,3 +145,56 @@ func TestTagSetKey(t *testing.T) {
 	ts := NewTagSetFromString("a:b,a:b:c,abc")
 	assert.Equal(t, ":abc,a:b,a:b:c", ts.Key())
 }
+
+// TestNormalizeTag feeds raw tags to NormalizeTag and compares each result
+// with the expected normalized form.
+func TestNormalizeTag(t *testing.T) {
+	type testCase struct{ in, out string }
+	cases := []testCase{
+		{in: "ok", out: "ok"},
+		{in: "AlsO:ök", out: "also:ök"},
+		{in: ":still_ok", out: ":still_ok"},
+		{in: "___trim", out: "trim"},
+		{in: "12.:trim@", out: ":trim"},
+		{in: "12.:trim@@", out: ":trim"},
+		{in: "fun:ky__tag/1", out: "fun:ky_tag/1"},
+		{in: "fun:ky@tag/2", out: "fun:ky_tag/2"},
+		{in: "fun:ky@@@tag/3", out: "fun:ky_tag/3"},
+		{in: "tag:1/2.3", out: "tag:1/2.3"},
+		{in: "---fun:k####y_ta@#g/1_@@#", out: "fun:k_y_ta_g/1"},
+		{in: "AlsO:œ#@ö))œk", out: "also:œ_ö_œk"},
+	}
+	for _, tc := range cases {
+		t.Run("", func(t *testing.T) {
+			got := NormalizeTag(tc.in)
+			assert.Equal(t, tc.out, got, tc.in)
+		})
+	}
+}
+
+// benchNormalizeTag returns a benchmark function that calls NormalizeTag on
+// tag b.N times and reports allocations.
+func benchNormalizeTag(tag string) func(b *testing.B) {
+	run := func(b *testing.B) {
+		b.ReportAllocs()
+		for n := b.N; n > 0; n-- {
+			NormalizeTag(tag)
+		}
+	}
+	return run
+}
+
+// BenchmarkNormalizeTag runs benchNormalizeTag as one sub-benchmark per
+// sample tag.
+func BenchmarkNormalizeTag(b *testing.B) {
+	samples := []struct{ name, tag string }{
+		{"ok", "good_tag"},
+		{"trim", "___trim_left"},
+		{"trim-both", "___trim_right@@#!"},
+		{"plenty", "fun:ky_ta@#g/1"},
+		{"more", "fun:k####y_ta@#g/1_@@#"},
+	}
+	for _, s := range samples {
+		b.Run(s.name, benchNormalizeTag(s.tag))
+	}
+}
diff --git a/releasenotes/notes/improve-performance-of-NormalizeTag-function-7eba70c13f0bdad7.yaml b/releasenotes/notes/improve-performance-of-NormalizeTag-function-7eba70c13f0bdad7.yaml
@@ -0,0 +1,4 @@
+---
+enhancements:
+  - |
+    APM: improve performance of NormalizeTag function.