-
Notifications
You must be signed in to change notification settings - Fork 2.1k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Add semi-sync monitor to unblock primaries blocked on semi-sync ACKs #17763
base: main
Are you sure you want to change the base?
Changes from all commits
771dc7e
4dcaf54
8a47cc0
bf9b845
955d0c0
3124339
77f964f
2b2708f
0be0743
d33b460
6092978
7286419
4725dd9
a8d2cbd
da4a8f8
6cf02ab
942eaf0
3a3db2a
eaa9246
a571e48
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -19,15 +19,18 @@ package newfeaturetest | |
import (
	"context"
	"fmt"
	"os"
	"os/exec"
	"sync"
	"testing"
	"time"

	"github.com/stretchr/testify/assert"
	"github.com/stretchr/testify/require"

	"vitess.io/vitess/go/mysql"
	"vitess.io/vitess/go/test/endtoend/cluster"
	"vitess.io/vitess/go/test/endtoend/reparent/utils"
	"vitess.io/vitess/go/vt/log"
	"vitess.io/vitess/go/vt/vtctl/reparentutil/policy"
)
|
||
|
@@ -234,3 +237,94 @@ func TestBufferingWithMultipleDisruptions(t *testing.T) { | |
// Wait for all the writes to have succeeded. | ||
wg.Wait() | ||
} | ||
|
||
// TestSemiSyncBlockDueToDisruption tests that Vitess can recover from a situation | ||
// where a primary is stuck waiting for semi-sync ACKs due to a network issue, | ||
// even if no new writes from the user arrives. | ||
func TestSemiSyncBlockDueToDisruption(t *testing.T) { | ||
t.Skip("Test not meant to be run on CI") | ||
clusterInstance := utils.SetupReparentCluster(t, policy.DurabilitySemiSync) | ||
defer utils.TeardownCluster(clusterInstance) | ||
tablets := clusterInstance.Keyspaces[0].Shards[0].Vttablets | ||
utils.ConfirmReplication(t, tablets[0], []*cluster.Vttablet{tablets[1], tablets[2], tablets[3]}) | ||
|
||
// stop heartbeats on all the replicas | ||
for idx, tablet := range tablets { | ||
if idx == 0 { | ||
continue | ||
} | ||
utils.RunSQLs(context.Background(), t, []string{ | ||
"stop slave;", | ||
"change master to MASTER_HEARTBEAT_PERIOD = 0;", | ||
"start slave;", | ||
Comment on lines
+257
to
+259
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. w/o the heartbeats... could we also set |
||
}, tablet) | ||
} | ||
|
||
// Take a backup of the pf.conf file | ||
runCommandWithSudo(t, "cp", "/etc/pf.conf", "/etc/pf.conf.backup") | ||
defer func() { | ||
// Restore the file from backup | ||
runCommandWithSudo(t, "mv", "/etc/pf.conf.backup", "/etc/pf.conf") | ||
runCommandWithSudo(t, "pfctl", "-f", "/etc/pf.conf") | ||
}() | ||
// Disrupt the network between the primary and the replicas | ||
runCommandWithSudo(t, "sh", "-c", fmt.Sprintf("echo 'block in proto tcp from any to any port %d' | sudo tee -a /etc/pf.conf > /dev/null", tablets[0].MySQLPort)) | ||
|
||
// This following command is only required if pfctl is not already enabled | ||
//runCommandWithSudo(t, "pfctl", "-e") | ||
runCommandWithSudo(t, "pfctl", "-f", "/etc/pf.conf") | ||
rules := runCommandWithSudo(t, "pfctl", "-s", "rules") | ||
log.Errorf("Rules enforced - %v", rules) | ||
|
||
// Start a write that will be blocked by the primary waiting for semi-sync ACKs | ||
ch := make(chan any) | ||
go func() { | ||
defer func() { | ||
close(ch) | ||
}() | ||
utils.ConfirmReplication(t, tablets[0], []*cluster.Vttablet{tablets[1], tablets[2], tablets[3]}) | ||
}() | ||
|
||
// Starting VTOrc later now, because we don't want it to fix the heartbeat interval | ||
// on the replica's before the disruption has been introduced. | ||
err := clusterInstance.StartVTOrc(clusterInstance.Keyspaces[0].Name) | ||
require.NoError(t, err) | ||
go func() { | ||
for { | ||
select { | ||
case <-ch: | ||
return | ||
case <-time.After(1 * time.Second): | ||
str, isPresent := tablets[0].VttabletProcess.GetVars()["SemiSyncMonitorWritesBlocked"] | ||
if isPresent { | ||
log.Errorf("SemiSyncMonitorWritesBlocked - %v", str) | ||
} | ||
} | ||
} | ||
}() | ||
// If the network disruption is too long lived, then we will end up running ERS from VTOrc. | ||
networkDisruptionDuration := 43 * time.Second | ||
time.Sleep(networkDisruptionDuration) | ||
|
||
// Restore the network | ||
runCommandWithSudo(t, "cp", "/etc/pf.conf.backup", "/etc/pf.conf") | ||
runCommandWithSudo(t, "pfctl", "-f", "/etc/pf.conf") | ||
|
||
// We expect the problem to be resolved in less than 30 seconds. | ||
select { | ||
case <-time.After(30 * time.Second): | ||
t.Errorf("Timed out waiting for semi-sync to be unblocked") | ||
case <-ch: | ||
log.Errorf("Woohoo, write finished!") | ||
} | ||
} | ||
|
||
// runCommandWithSudo runs the provided command with sudo privileges | ||
// when the command is run, it prompts the user for the password, and it must be | ||
// entered for the program to resume. | ||
func runCommandWithSudo(t *testing.T, args ...string) string { | ||
cmd := exec.Command("sudo", args...) | ||
out, err := cmd.CombinedOutput() | ||
assert.NoError(t, err, string(out)) | ||
return string(out) | ||
} |
Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.
Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,21 @@ | ||
/* | ||
Copyright 2025 The Vitess Authors. | ||
|
||
Licensed under the Apache License, Version 2.0 (the "License"); | ||
you may not use this file except in compliance with the License. | ||
You may obtain a copy of the License at | ||
|
||
http://www.apache.org/licenses/LICENSE-2.0 | ||
|
||
Unless required by applicable law or agreed to in writing, software | ||
distributed under the License is distributed on an "AS IS" BASIS, | ||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
See the License for the specific language governing permissions and | ||
limitations under the License. | ||
*/ | ||
|
||
/*
semisync_recover has a single column, ts, which is also its primary key.
NOTE(review): presumably written to by the semi-sync monitor introduced in
this change; confirm against the monitor code.
*/
CREATE TABLE IF NOT EXISTS semisync_recover
(
    ts BIGINT UNSIGNED NOT NULL,
    PRIMARY KEY (`ts`)
) ENGINE = InnoDB CHARSET = utf8mb4
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
We should only do this if the `CI` env var is set. Otherwise you have to edit the test to run it locally as well. We do this today in a number of other places.