Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Reduce flakiness in test fts_segment_reset #518

Merged
merged 1 commit into from
Jul 12, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
18 changes: 12 additions & 6 deletions src/test/isolation2/expected/fts_segment_reset.out
Original file line number Diff line number Diff line change
Expand Up @@ -10,19 +10,24 @@
-- start_ignore
alter system set gp_fts_probe_interval to 10;
ALTER
-- After RESET, the primary still takes a little while to restart. During
-- that time FTS may think the primary is in the "recovery not in
-- progress" stage and promote the mirror, so we need FTS to make that
-- decision less readily.
alter system set gp_fts_probe_retries to 15;
ALTER
select pg_reload_conf();
pg_reload_conf
----------------
t
(1 row)
-- end_ignore

-- Let the background writer sleep 27 seconds to delay the resetting.
-- This number is selected because there's a slight chance that FTS senses
-- "recovery not in progress" after its 5-second retry window and promote
-- the mirror. So just put the end of the sleep period away from the end
-- of the retry windows.
select gp_inject_fault('fault_in_background_writer_quickdie', 'sleep', '', '', '', 1, 1, 27, dbid) from gp_segment_configuration where role = 'p' and content = 0;
-- Let the background writer sleep 17 seconds to delay the resetting.
-- This number is selected to be larger than the 15-second retry window
-- which makes a meaningful test, meanwhile reduce the chance that FTS sees
-- a "recovery not in progress" primary as much as possible.
select gp_inject_fault('fault_in_background_writer_quickdie', 'sleep', '', '', '', 1, 1, 17, dbid) from gp_segment_configuration where role = 'p' and content = 0;
gp_inject_fault
-----------------
Success:
Expand Down Expand Up @@ -94,6 +99,7 @@ select pg_sleep(30);
-- start_ignore
-- restore parameters
alter system reset gp_fts_probe_interval;
alter system reset gp_fts_probe_retries;
select pg_reload_conf();
-- end_ignore

Expand Down
17 changes: 11 additions & 6 deletions src/test/isolation2/sql/fts_segment_reset.sql
Original file line number Diff line number Diff line change
Expand Up @@ -9,15 +9,19 @@
-- Let FTS detect/declare failure sooner
-- start_ignore
alter system set gp_fts_probe_interval to 10;
-- After RESET, the primary still takes a little while to restart. During
-- that time FTS may think the primary is in the "recovery not in
-- progress" stage and promote the mirror, so we need FTS to make that
-- decision less readily.
alter system set gp_fts_probe_retries to 15;
select pg_reload_conf();
-- end_ignore

-- Let the background writer sleep 27 seconds to delay the resetting.
-- This number is selected because there's a slight chance that FTS senses
-- "recovery not in progress" after its 5-second retry window and promote
-- the mirror. So just put the end of the sleep period away from the end
-- of the retry windows.
select gp_inject_fault('fault_in_background_writer_quickdie', 'sleep', '', '', '', 1, 1, 27, dbid)
-- Let the background writer sleep 17 seconds to delay the resetting.
-- This number is selected to be larger than the 15-second retry window
-- which makes a meaningful test, meanwhile reduce the chance that FTS sees
-- a "recovery not in progress" primary as much as possible.
select gp_inject_fault('fault_in_background_writer_quickdie', 'sleep', '', '', '', 1, 1, 17, dbid)
from gp_segment_configuration where role = 'p' and content = 0;

-- Do not let the postmaster send SIGKILL to the bgwriter
Expand Down Expand Up @@ -54,6 +58,7 @@ select pg_sleep(30);
-- start_ignore
-- restore parameters
alter system reset gp_fts_probe_interval;
alter system reset gp_fts_probe_retries;
select pg_reload_conf();
-- end_ignore

Expand Down
Loading