Merge pull request #4936 from onflow/petera/4798-improve-compare-logging

[Access] Improve script exec compare logging
onflow · Nov 8, 2023 · 1990e2f · 1990e2f
2 parents fe6714e + 74d9e4e
commit 1990e2f
Showing 1 changed file with 57 additions and 23 deletions.
diff --git a/engine/access/rpc/backend/backend_scripts.go b/engine/access/rpc/backend/backend_scripts.go
@@ -102,33 +102,35 @@ func (b *backendScripts) executeScript(
 
 	switch b.scriptExecMode {
 	case ScriptExecutionModeExecutionNodesOnly:
-		return b.executeScriptOnAvailableExecutionNodes(ctx, blockID, script, arguments, insecureScriptHash)
+		result, _, err := b.executeScriptOnAvailableExecutionNodes(ctx, blockID, script, arguments, insecureScriptHash)
+		return result, err
 
 	case ScriptExecutionModeLocalOnly:
-		return b.executeScriptLocally(ctx, blockID, height, script, arguments, insecureScriptHash)
+		result, _, err := b.executeScriptLocally(ctx, blockID, height, script, arguments, insecureScriptHash)
+		return result, err
 
 	case ScriptExecutionModeFailover:
-		localResult, localErr := b.executeScriptLocally(ctx, blockID, height, script, arguments, insecureScriptHash)
+		localResult, localDuration, localErr := b.executeScriptLocally(ctx, blockID, height, script, arguments, insecureScriptHash)
 		if localErr == nil || isInvalidArgumentError(localErr) {
 			return localResult, localErr
 		}
-		execResult, execErr := b.executeScriptOnAvailableExecutionNodes(ctx, blockID, script, arguments, insecureScriptHash)
+		execResult, execDuration, execErr := b.executeScriptOnAvailableExecutionNodes(ctx, blockID, script, arguments, insecureScriptHash)
 
-		b.compareScriptExecutionResults(execResult, execErr, localResult, localErr, blockID, script, insecureScriptHash)
+		b.compareScriptExecutionResults(execResult, execDuration, execErr, localResult, localDuration, localErr, blockID, script, arguments, insecureScriptHash)
 
 		return execResult, execErr
 
 	case ScriptExecutionModeCompare:
-		execResult, execErr := b.executeScriptOnAvailableExecutionNodes(ctx, blockID, script, arguments, insecureScriptHash)
+		execResult, execDuration, execErr := b.executeScriptOnAvailableExecutionNodes(ctx, blockID, script, arguments, insecureScriptHash)
 		// we can only compare the results if there were either no errors or a cadence error
 		// since we cannot distinguish the EN error as caused by the block being pruned or some other reason,
 		// which may produce a valid RN output but an error for the EN
 		if execErr != nil && !isInvalidArgumentError(execErr) {
 			return nil, execErr
 		}
-		localResult, localErr := b.executeScriptLocally(ctx, blockID, height, script, arguments, insecureScriptHash)
+		localResult, localDuration, localErr := b.executeScriptLocally(ctx, blockID, height, script, arguments, insecureScriptHash)
 
-		b.compareScriptExecutionResults(execResult, execErr, localResult, localErr, blockID, script, insecureScriptHash)
+		b.compareScriptExecutionResults(execResult, execDuration, execErr, localResult, localDuration, localErr, blockID, script, arguments, insecureScriptHash)
 
 		// always return EN results
 		return execResult, execErr
@@ -146,7 +148,7 @@ func (b *backendScripts) executeScriptLocally(
 	script []byte,
 	arguments [][]byte,
 	insecureScriptHash [md5.Size]byte,
-) ([]byte, error) {
+) ([]byte, time.Duration, error) {
 	execStartTime := time.Now()
 
 	result, err := b.scriptExecutor.ExecuteAtBlockHeight(ctx, script, arguments, height)
@@ -174,7 +176,7 @@ func (b *backendScripts) executeScriptLocally(
 			b.metrics.ScriptExecutionErrorLocal()
 		}
 
-		return nil, convertedErr
+		return nil, execDuration, convertedErr
 	}
 
 	if b.log.GetLevel() == zerolog.DebugLevel && b.shouldLogScript(execEndTime, insecureScriptHash) {
@@ -187,7 +189,7 @@ func (b *backendScripts) executeScriptLocally(
 	// log execution time
 	b.metrics.ScriptExecuted(execDuration, len(script))
 
-	return result, nil
+	return result, execDuration, nil
 }
 
 // executeScriptOnAvailableExecutionNodes executes the provided script using available execution nodes.
@@ -197,11 +199,11 @@ func (b *backendScripts) executeScriptOnAvailableExecutionNodes(
 	script []byte,
 	arguments [][]byte,
 	insecureScriptHash [md5.Size]byte,
-) ([]byte, error) {
+) ([]byte, time.Duration, error) {
 	// find few execution nodes which have executed the block earlier and provided an execution receipt for it
 	executors, err := executionNodesForBlockID(ctx, blockID, b.executionReceipts, b.state, b.log)
 	if err != nil {
-		return nil, status.Errorf(codes.Internal, "failed to find script executors at blockId %v: %v", blockID.String(), err)
+		return nil, 0, status.Errorf(codes.Internal, "failed to find script executors at blockId %v: %v", blockID.String(), err)
 	}
 
 	lg := b.log.With().
@@ -210,21 +212,27 @@ func (b *backendScripts) executeScriptOnAvailableExecutionNodes(
 		Logger()
 
 	var result []byte
+	var execDuration time.Duration
 	errToReturn := b.nodeCommunicator.CallAvailableNode(
 		executors,
 		func(node *flow.Identity) error {
 			execStartTime := time.Now()
+
 			result, err = b.tryExecuteScriptOnExecutionNode(ctx, node.Address, blockID, script, arguments)
+
+			executionTime := time.Now()
+			execDuration = executionTime.Sub(execStartTime)
+
 			if err != nil {
 				return err
 			}
 
 			if b.log.GetLevel() == zerolog.DebugLevel {
-				executionTime := time.Now()
 				if b.shouldLogScript(executionTime, insecureScriptHash) {
 					lg.Debug().
 						Str("script_executor_addr", node.Address).
 						Str("script", string(script)).
+						Dur("execution_dur_ms", execDuration).
 						Msg("Successfully executed script")
 					b.loggedScripts.Add(insecureScriptHash, executionTime)
 				}
@@ -252,10 +260,10 @@ func (b *backendScripts) executeScriptOnAvailableExecutionNodes(
 			b.metrics.ScriptExecutionErrorOnExecutionNode()
 			b.log.Error().Err(errToReturn).Msg("script execution failed for execution node internal reasons")
 		}
-		return nil, rpc.ConvertError(errToReturn, "failed to execute script on execution nodes", codes.Internal)
+		return nil, execDuration, rpc.ConvertError(errToReturn, "failed to execute script on execution nodes", codes.Internal)
 	}
 
-	return result, nil
+	return result, execDuration, nil
 }
 
 // tryExecuteScriptOnExecutionNode attempts to execute the script on the given execution node.
@@ -291,17 +299,20 @@ func isInvalidArgumentError(scriptExecutionErr error) bool {
 
 func (b *backendScripts) compareScriptExecutionResults(
 	execNodeResult []byte,
+	execDuration time.Duration,
 	execErr error,
 	localResult []byte,
+	localDuration time.Duration,
 	localErr error,
 	blockID flow.Identifier,
 	script []byte,
+	arguments [][]byte,
 	insecureScriptHash [md5.Size]byte,
 ) {
 	// record errors caused by missing local data
 	if localErr != nil && status.Code(localErr) == codes.OutOfRange {
 		b.metrics.ScriptExecutionNotIndexed()
-		b.logScriptExecutionComparison(execNodeResult, execErr, localResult, localErr, blockID, script, insecureScriptHash,
+		b.logScriptExecutionComparison(execNodeResult, execDuration, execErr, localResult, localDuration, localErr, blockID, script, arguments, insecureScriptHash,
 			"script execution results do not match EN because data is not indexed yet")
 		return
 	}
@@ -322,14 +333,26 @@ func (b *backendScripts) compareScriptExecutionResults(
 			// should match.
 			execParts := strings.Split(execErr.Error(), "failed to execute script at block")
 			localParts := strings.Split(localErr.Error(), "failed to execute script at block")
-			if len(execParts) == 2 && len(localParts) == 2 && execParts[1] == localParts[1] {
-				b.metrics.ScriptExecutionErrorMatch()
-				return
+			if len(execParts) == 2 && len(localParts) == 2 {
+				if execParts[1] == localParts[1] {
+					b.metrics.ScriptExecutionErrorMatch()
+					return
+				}
+
+				// by default ENs are configured with longer script error size limits. check if the
+				// non-truncated parts match
+				localSubParts := strings.Split(localParts[1], " ... ")
+				if len(localSubParts) == 2 &&
+					strings.HasPrefix(execParts[1], localSubParts[0]) &&
+					strings.HasSuffix(execParts[1], localSubParts[1]) {
+					b.metrics.ScriptExecutionErrorMatch()
+					return
+				}
 			}
 		}
 
 		b.metrics.ScriptExecutionErrorMismatch()
-		b.logScriptExecutionComparison(execNodeResult, execErr, localResult, localErr, blockID, script, insecureScriptHash,
+		b.logScriptExecutionComparison(execNodeResult, execDuration, execErr, localResult, localDuration, localErr, blockID, script, arguments, insecureScriptHash,
 			"cadence errors from local execution do not match and EN")
 		return
 	}
@@ -340,37 +363,48 @@ func (b *backendScripts) compareScriptExecutionResults(
 	}
 
 	b.metrics.ScriptExecutionResultMismatch()
-	b.logScriptExecutionComparison(execNodeResult, execErr, localResult, localErr, blockID, script, insecureScriptHash,
+	b.logScriptExecutionComparison(execNodeResult, execDuration, execErr, localResult, localDuration, localErr, blockID, script, arguments, insecureScriptHash,
 		"script execution results from local execution do not match EN")
 }
 
 // logScriptExecutionComparison logs the script execution comparison between local execution and execution node
 func (b *backendScripts) logScriptExecutionComparison(
 	execNodeResult []byte,
+	execDuration time.Duration,
 	execErr error,
 	localResult []byte,
+	localDuration time.Duration,
 	localErr error,
 	blockID flow.Identifier,
 	script []byte,
+	arguments [][]byte,
 	insecureScriptHash [md5.Size]byte,
 	msg string,
 ) {
+	args := make([]string, len(arguments))
+	for i, arg := range arguments {
+		args[i] = string(arg)
+	}
+
 	lgCtx := b.log.With().
 		Hex("block_id", blockID[:]).
+		Hex("script_hash", insecureScriptHash[:]).
 		Str("script", string(script)).
-		Hex("script_hash", insecureScriptHash[:])
+		Strs("args", args)
 
 	if execErr != nil {
 		lgCtx = lgCtx.AnErr("execution_node_error", execErr)
 	} else {
 		lgCtx = lgCtx.Hex("execution_node_result", execNodeResult)
 	}
+	lgCtx = lgCtx.Dur("execution_node_duration_ms", execDuration)
 
 	if localErr != nil {
 		lgCtx = lgCtx.AnErr("local_error", localErr)
 	} else {
 		lgCtx = lgCtx.Hex("local_result", localResult)
 	}
+	lgCtx = lgCtx.Dur("local_duration_ms", localDuration)
 
 	lg := lgCtx.Logger()
 	lg.Debug().Msg(msg)