From 9c185596918085f11ddc959d950f269bef43b74d Mon Sep 17 00:00:00 2001
From: Huansong Fu <fuhuansong@gmail.com>
Date: Mon, 23 May 2022 12:55:46 -0700
Subject: [PATCH] Reduce flakiness in test fts_segment_reset
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Have seen some flakiness in test fts_segment_reset because sometimes
FTS would still promote mirror if the primary takes a bit longer to
restart after getting out of RESET stage. An example like below:

- Primary 0 gets out of RESET and was going to be restarted:
2022-05-23 15:32:53.924540 UTC,,,p105578,th1560833280,,,,0,,,seg0,,,,,"LOG","00000","all server processes terminated; reinitializing",,,,,,,0,,"postmaster.c",4284,
- And it takes primary 0 about 2-3 seconds to do so:
2022-05-23 15:32:56.184117 UTC,,,p105578,th1560833280,,,,0,,,seg0,,,,,"LOG","00000","database system is ready to accept connections”

- Unfortunately before primary 0 could restart, FTS makes one last probe
and finds that it is in recovery mode, and not making progress (which is
"correct" because primary 0 has finished recovery):
2022-05-23 15:32:56.009206 UTC,,,p102591,th2023709952,,,,0,con3,,seg-1,,,,,"LOG","00000","FTS: detected segment is in recovery mode and not making progress (content=0) primary dbid=2, mirror dbid=5",,,,,,,0,,"ftsprobe.c",254,
2022-05-23 15:32:56.065399 UTC,,,p102591,th2023709952,,,,0,con3,,seg-1,,,,,"LOG","00000","FTS max (5) retries exhausted (content=0, dbid=2) state=9",,,,,,,0,,"ftsprobe.c”,788

Currently, we let primary stay in the RESET stage for 27 seconds.
The FTS has a default of 5-second retry cycle, at the end of which
it makes promote decision. That leaves about 3 seconds for the primary
to start after getting out of RESET, which is probably too short.
Now make the retry cycle 15 seconds and let the RESET delay to be 17
seconds. That leave about 13 seconds for the primary to start after that,
which should be well enough to reduce common flakiness.
---
 .../isolation2/expected/fts_segment_reset.out  | 18 ++++++++++++------
 src/test/isolation2/sql/fts_segment_reset.sql  | 17 +++++++++++------
 2 files changed, 23 insertions(+), 12 deletions(-)

diff --git a/src/test/isolation2/expected/fts_segment_reset.out b/src/test/isolation2/expected/fts_segment_reset.out
index 943d06fbbc9..141c104f772 100644
--- a/src/test/isolation2/expected/fts_segment_reset.out
+++ b/src/test/isolation2/expected/fts_segment_reset.out
@@ -10,6 +10,12 @@
 -- start_ignore
 alter system set gp_fts_probe_interval to 10;
 ALTER
+-- Because after RESET, it still takes a little while for the primary
+-- to restart, and potentially makes FTS think it's in "recovery not
+-- in progress" stage and promote the mirror, we would need the FTS
+-- to make that decision a bit less frequently.
+alter system set gp_fts_probe_retries to 15;
+ALTER
 select pg_reload_conf();
  pg_reload_conf 
 ----------------
@@ -17,12 +23,11 @@ select pg_reload_conf();
 (1 row)
 -- end_ignore
 
--- Let the background writer sleep 27 seconds to delay the resetting.
--- This number is selected because there's a slight chance that FTS senses
--- "recovery not in progress" after its 5-second retry window and promote
--- the mirror. So just put the end of the sleep perid away from the end
--- of the retry windows.
-select gp_inject_fault('fault_in_background_writer_quickdie', 'sleep', '', '', '', 1, 1, 27, dbid) from gp_segment_configuration where role = 'p' and content = 0;
+-- Let the background writer sleep 17 seconds to delay the resetting.
+-- This number is selected to be larger than the 15-second retry window
+-- which makes a meaningful test, meanwhile reduce the chance that FTS sees
+-- a "recovery not in progress" primary as much as possible.
+select gp_inject_fault('fault_in_background_writer_quickdie', 'sleep', '', '', '', 1, 1, 17, dbid) from gp_segment_configuration where role = 'p' and content = 0;
  gp_inject_fault 
 -----------------
  Success:        
@@ -94,6 +99,7 @@ select pg_sleep(30);
 -- start_ignore
 -- restore parameters
 alter system reset gp_fts_probe_interval;
+alter system reset gp_fts_probe_retries;
 select pg_reload_conf();
 -- end_ignore
 
diff --git a/src/test/isolation2/sql/fts_segment_reset.sql b/src/test/isolation2/sql/fts_segment_reset.sql
index 7f76a62593c..2480f97dfee 100644
--- a/src/test/isolation2/sql/fts_segment_reset.sql
+++ b/src/test/isolation2/sql/fts_segment_reset.sql
@@ -9,15 +9,19 @@
 -- Let FTS detect/declare failure sooner 
 -- start_ignore
 alter system set gp_fts_probe_interval to 10;
+-- Because after RESET, it still takes a little while for the primary
+-- to restart, and potentially makes FTS think it's in "recovery not
+-- in progress" stage and promote the mirror, we would need the FTS
+-- to make that decision a bit less frequently.
+alter system set gp_fts_probe_retries to 15;
 select pg_reload_conf();
 -- end_ignore
 
--- Let the background writer sleep 27 seconds to delay the resetting.
--- This number is selected because there's a slight chance that FTS senses 
--- "recovery not in progress" after its 5-second retry window and promote 
--- the mirror. So just put the end of the sleep perid away from the end 
--- of the retry windows.
-select gp_inject_fault('fault_in_background_writer_quickdie', 'sleep', '', '', '', 1, 1, 27, dbid) 
+-- Let the background writer sleep 17 seconds to delay the resetting.
+-- This number is selected to be larger than the 15-second retry window
+-- which makes a meaningful test, meanwhile reduce the chance that FTS sees
+-- a "recovery not in progress" primary as much as possible.
+select gp_inject_fault('fault_in_background_writer_quickdie', 'sleep', '', '', '', 1, 1, 17, dbid) 
 from gp_segment_configuration where role = 'p' and content = 0;
 
 -- Do not let the postmaster send SIGKILL to the bgwriter
@@ -54,6 +58,7 @@ select pg_sleep(30);
 -- start_ignore
 -- restore parameters
 alter system reset gp_fts_probe_interval;
+alter system reset gp_fts_probe_retries;
 select pg_reload_conf();
 -- end_ignore