Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add explicit warning when an overlap is detected #2929

Open
wants to merge 1 commit into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
58 changes: 49 additions & 9 deletions pkg/engine/engine.go
Original file line number Diff line number Diff line change
Expand Up @@ -881,12 +881,22 @@ type chunkSecretKey struct {
detectorKey ahocorasick.DetectorKey
}

func likelyDuplicate(ctx context.Context, val chunkSecretKey, dupes map[chunkSecretKey]struct{}) bool {
const similarityThreshold = 0.9
const similarityThreshold = 0.9

valStr := val.secret
func likelyDuplicate(ctx context.Context, val chunkSecretKey, dupes map[chunkSecretKey]struct{}) (bool, detectorspb.DetectorType) {
var (
detectorType detectorspb.DetectorType
valStr = val.secret
valRedacted string
)
if len(valStr) < 3 {
return false, detectorType
} else {
valRedacted = valStr[:3] + "..."
}
for dupeKey := range dupes {
dupe := dupeKey.secret
detectorType = dupeKey.detectorKey.Type()
// Avoid comparing strings of vastly different lengths.
if len(dupe)*10 < len(valStr)*9 || len(dupe)*10 > len(valStr)*11 {
continue
Expand All @@ -899,25 +909,42 @@ func likelyDuplicate(ctx context.Context, val chunkSecretKey, dupes map[chunkSec
}

if valStr == dupe {
ctx.Logger().V(2).Info(
ctx.Logger().V(1).Info(
"found exact duplicate",
"val", valRedacted,
"val_detector", val.detectorKey.Type(),
"dupe_detector", dupeKey.detectorKey.Type(),
)
return true
return true, detectorType
}

similarity := strutil.Similarity(valStr, dupe, metrics.NewLevenshtein())

// close enough
if similarity > similarityThreshold {
ctx.Logger().V(2).Info(
ctx.Logger().V(1).Info(
"found similar duplicate",
"val", valRedacted,
"val_detector", val.detectorKey.Type(),
"dupe_detector", dupeKey.detectorKey.Type(),
)
return true
return true, detectorType
}
}
return false
return false, detectorType
}

type detectorOverlapKey struct {
DetectorA detectorspb.DetectorType
DetectorB detectorspb.DetectorType
}

func (d detectorOverlapKey) Equal(other detectorOverlapKey) bool {
return (d.DetectorA == other.DetectorA && d.DetectorB == other.DetectorB) || (d.DetectorA == other.DetectorB && d.DetectorB == other.DetectorA)
}

var detectorOverlaps = make(map[detectorOverlapKey]struct{})

func (e *Engine) verificationOverlapWorker(ctx context.Context) {
var wgDetect sync.WaitGroup

Expand Down Expand Up @@ -976,7 +1003,20 @@ func (e *Engine) verificationOverlapWorker(ctx context.Context) {
continue
}

if likelyDuplicate(ctx, key, chunkSecrets) {
if ok, t := likelyDuplicate(ctx, key, chunkSecrets); ok {
Copy link
Contributor Author

@rgmz rgmz Jun 6, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Personally, I don't understand the threat model for this feature. It silently skips verification if a result matches more than one detector, which happens frequently during normal operation and can result in valid secrets being skipped.

The notion of a "malicious" detector is confusing — is Truffle adding malicious detectors to TruffleHog? Either this check needs a whitelist of common overlaps (e.g., Azure-related, URL and FTP), or it should only trigger between built-in and custom detectors (IMO).

// Record the overlap between detectors.
overlapKey := detectorOverlapKey{
key.detectorKey.Type(),
t,
}
if _, ok := detectorOverlaps[overlapKey]; !ok {
detectorOverlaps[overlapKey] = struct{}{}
ctx.Logger().Info(
"WARNING: A result will not be verified because more than one detector matches. "+
"You can override this behavior by using the --allow-verification-overlap flag",
"detectors", []string{overlapKey.DetectorA.String(), overlapKey.DetectorB.String()})
}

// This indicates that the same secret was found by multiple detectors.
// We should NOT VERIFY this chunk's data.
if e.verificationOverlapTracker != nil {
Expand Down
4 changes: 2 additions & 2 deletions pkg/engine/engine_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -942,7 +942,7 @@ func TestLikelyDuplicate(t *testing.T) {
name: "empty strings",
val: chunkSecretKey{"", detectorA.Key},
dupes: map[chunkSecretKey]struct{}{{"", detectorB.Key}: {}},
expected: true,
expected: false,
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Should empty strings be considered duplicates? I guess it makes sense, but I'd initially tweaked the logic for likelyDuplicate to ignore anything smaller than 3 characters.

},
{
name: "similar within threshold same detector",
Expand All @@ -957,7 +957,7 @@ func TestLikelyDuplicate(t *testing.T) {
for _, tc := range tests {
t.Run(tc.name, func(t *testing.T) {
ctx := context.Background()
result := likelyDuplicate(ctx, tc.val, tc.dupes)
result, _ := likelyDuplicate(ctx, tc.val, tc.dupes)
if result != tc.expected {
t.Errorf("expected %v, got %v", tc.expected, result)
}
Expand Down