diff --git a/model/lantern/lantern_resource.rb b/model/lantern/lantern_resource.rb
index e0e02a4ed..751c12efb 100644
--- a/model/lantern/lantern_resource.rb
+++ b/model/lantern/lantern_resource.rb
@@ -340,6 +340,66 @@ module HaType
     SYNC = "sync"
   end
 
+  # Checks that this replica can take over from its parent and, if so,
+  # requests the switchover via incr_switchover_with_parent.
+  #
+  # force - skips the consistency checks and records "force_switchover" in
+  #         the strand's first stack frame. Only true, "true" or "1"
+  #         (case-insensitive) count as forced: the API route passes the
+  #         raw request parameter, and plain Ruby truthiness would treat
+  #         the string "false" as a forced switchover.
+  #
+  # Fails with a RuntimeError when there is no parent, logical replication
+  # is off, or (when not forced) the parent has databases, login roles or
+  # more large objects than the replica.
+  def prepare_switchover(force = false)
+    if parent.nil? || !logical_replication
+      fail "Database does not have parent or is not in logical replication state"
+    end
+
+    # Normalize before branching: every non-nil string is truthy in Ruby.
+    force = %w[true 1].include?(force.to_s.strip.downcase)
+
+    if !force
+      err = ""
+      replica_dbs = representative_server.list_all_databases
+      parent_dbs = parent.representative_server.list_all_databases
+      db_diff = parent_dbs - replica_dbs
+
+      if db_diff.any?
+        err = "The following databases were not synced to replica: #{db_diff.join(",")}\n"
+      end
+
+      replica_roles = representative_server.list_all_roles
+      parent_roles = parent.representative_server.list_all_roles
+      roles_diff = parent_roles - replica_roles
+
+      if roles_diff.any?
+        err = "#{err}The following roles were not synced to replica: #{roles_diff.join(",")}\n"
+      end
+
+      lo_count_replica = representative_server.run_query("SELECT COUNT(*) FROM pg_largeobject_metadata")
+      lo_count_parent = parent.representative_server.run_query("SELECT COUNT(*) FROM pg_largeobject_metadata")
+      lo_diff = lo_count_parent.to_i - lo_count_replica.to_i
+
+      if lo_diff > 0
+        err = "#{err}Parent database has #{lo_diff} more large objects than replica\n"
+      end
+
+      if !err.empty?
+        err = "Inconsistencies found between parent and replica databases.\nPlease synchronize databases manually or create new replica or pass force=true if you are sure you want to switchover\n#{err}"
+        fail err
+      end
+    else
+      current_frame = strand.stack.first
+      current_frame["force_switchover"] = true
+      strand.modified!(:stack)
+      strand.save_changes
+    end
+
+    incr_switchover_with_parent
+  end
+
   def rollback_switchover
     current_resource = LanternResource[rollback_target]
     # stop current one and start old one
diff --git a/model/lantern/lantern_server.rb b/model/lantern/lantern_server.rb
index 121d3f57a..eeb08bdc0 100644
--- a/model/lantern/lantern_server.rb
+++ b/model/lantern/lantern_server.rb
@@ -125,6 +125,13 @@ def configure_hash
       postgresql_recovery_target_lsn = ""
     end
 
+    big_query_dataset = ""
+
+    if vm.cores > 1
+      # enable big query logs only if this is not the smallest instance
+      big_query_dataset = Config.lantern_log_dataset
+    end
+
     JSON.generate({
       enable_coredumps: true,
       skip_deps: vm.boot_image != Config.gcp_default_image,
@@ -153,7 +160,7 @@ def configure_hash
       gcp_creds_walg_b64: walg_config[:gcp_creds_b64],
       walg_gs_prefix: walg_config[:walg_gs_prefix],
       gcp_creds_big_query_b64: resource.gcp_creds_b64,
-      big_query_dataset: Config.lantern_log_dataset,
+      big_query_dataset: big_query_dataset,
       pg_version: resource.pg_version
     })
   end
@@ -249,6 +256,27 @@ def list_all_databases
       .map { _1.strip }
   end
 
+  # Lists role names from pg_roles by running psql inside the postgresql
+  # compose service over SSH.
+  #
+  # login - when true (default) only roles with rolcanlogin are returned,
+  #         otherwise all roles.
+  #
+  # Returns an Array of stripped role names.
+  def list_all_roles(login = true)
+    condition = if login
+      "WHERE rolcanlogin=TRUE"
+    else
+      ""
+    end
+
+    vm.sshable.cmd("sudo docker compose -f #{Config.compose_file} exec postgresql psql -U postgres -t -c 'SELECT rolname FROM pg_roles #{condition}'")
+      .chomp
+      .strip
+      .split("\n")
+      .map { _1.strip }
+  end
+
   def autoresize_disk
     new_storage_size = (target_storage_size_gib * 1.5).clamp(..max_storage_autoresize_gib)
     return if new_storage_size < target_storage_size_gib
diff --git a/prog/lantern/lantern_resource_nexus.rb b/prog/lantern/lantern_resource_nexus.rb
index a25289a0d..055dc20ef 100644
--- a/prog/lantern/lantern_resource_nexus.rb
+++ b/prog/lantern/lantern_resource_nexus.rb
@@ -274,6 +274,12 @@ def before_run
 
     lantern_resource.mark_switchover_finish
 
+    # the force flag only applies to the switchover that just finished
+    current_frame = strand.stack.first
+    current_frame.delete("force_switchover")
+    strand.modified!(:stack)
+    strand.save_changes
+
     hop_wait
   end
 
@@ -348,7 +354,8 @@ def before_run
   end
 
   label def wait_for_synchronization
-    nap 5 if lantern_resource.parent.get_logical_replication_lag("slot_#{lantern_resource.ubid}") != 0
+    # a forced switchover does not wait for the replication lag to reach zero
+    nap 5 if !frame["force_switchover"] && lantern_resource.parent.get_logical_replication_lag("slot_#{lantern_resource.ubid}") != 0
 
     hop_delete_logical_subscription
   end
diff --git a/routes/api/project/location/lantern.rb b/routes/api/project/location/lantern.rb
index c1c4f0894..756fd55b9 100644
--- a/routes/api/project/location/lantern.rb
+++ b/routes/api/project/location/lantern.rb
@@ -179,14 +179,19 @@ class CloverApi
 
       r.post "switchover" do
         Authorization.authorize(@current_user.id, "Postgres:edit", pg.id)
 
-        if pg.parent.nil? || !pg.logical_replication
-          fail CloverError.new(400, "Invalid request", "Database does not have parent or is not in logical replication state")
-        end
-
-        pg.incr_switchover_with_parent
+        begin
+          pg.prepare_switchover(r.params["force"])
+        rescue => e
+          # Only failures of the switchover preparation become a 422; the
+          # rescue is scoped to this call so that errors raised by
+          # Authorization.authorize above are not rewritten.
+          response.status = 422
+          return {"error" => e.message}
+        end
+
         response.status = 200
         r.halt
       end
 
       r.post "rollback-switchover" do
diff --git a/spec/model/lantern/lantern_resource_spec.rb b/spec/model/lantern/lantern_resource_spec.rb
index b2e48c22a..38115dab4 100644
--- a/spec/model/lantern/lantern_resource_spec.rb
+++ b/spec/model/lantern/lantern_resource_spec.rb
@@ -406,4 +406,101 @@
       expect { lantern_resource.mark_switchover_finish }.not_to raise_error
     end
   end
+
+  describe "#prepare_switchover" do
+    it "fails if no parent" do
+      expect(lantern_resource).to receive(:parent).and_return(nil)
+      expect { lantern_resource.prepare_switchover }.to raise_error "Database does not have parent or is not in logical replication state"
+    end
+
+    it "fails if not in logical replication" do
+      parent = instance_double(described_class)
+      expect(lantern_resource).to receive(:parent).and_return(parent)
+      expect(lantern_resource).to receive(:logical_replication).and_return(false)
+      expect { lantern_resource.prepare_switchover }.to raise_error "Database does not have parent or is not in logical replication state"
+    end
+
+    it "success if force" do
+      parent = instance_double(described_class)
+      expect(lantern_resource).to receive(:parent).and_return(parent)
+      expect(lantern_resource).to receive(:logical_replication).and_return(true)
+      expect(lantern_resource).to receive(:incr_switchover_with_parent)
+      frame = instance_double(Hash)
+      strand = instance_double(Strand, stack: [frame])
+      expect(frame).to receive(:[]=).with("force_switchover", true)
+      expect(strand).to receive(:modified!)
+      expect(strand).to receive(:save_changes)
+      expect(lantern_resource).to receive(:strand).and_return(strand).at_least(:once)
+      expect { lantern_resource.prepare_switchover(true) }.not_to raise_error
+    end
+
+    it "fails if db list differs" do
+      representative_server = instance_double(LanternServer)
+      parent_representative_server = instance_double(LanternServer)
+      parent = instance_double(described_class, representative_server: parent_representative_server)
+      parent_databases = ["db1", "db2", "db3"]
+      replica_databases = ["db1"]
+
+      allow(lantern_resource).to receive_messages(representative_server: representative_server, parent: parent)
+      expect(lantern_resource).to receive(:logical_replication).and_return(true)
+
+      allow(parent_representative_server).to receive_messages(list_all_databases: parent_databases, list_all_roles: [], run_query: "0")
+      allow(representative_server).to receive_messages(list_all_databases: replica_databases, list_all_roles: [], run_query: "0")
+
+      err = "The following databases were not synced to replica: db2,db3\n"
+      expect { lantern_resource.prepare_switchover }.to raise_error "Inconsistencies found between parent and replica databases.\nPlease synchronize databases manually or create new replica or pass force=true if you are sure you want to switchover\n#{err}"
+    end
+
+    it "fails if role list differs" do
+      representative_server = instance_double(LanternServer)
+      parent_representative_server = instance_double(LanternServer)
+      parent = instance_double(described_class, representative_server: parent_representative_server)
+      parent_databases = ["db1", "db2", "db3"]
+      replica_databases = ["db1", "db2", "db3"]
+
+      allow(lantern_resource).to receive_messages(representative_server: representative_server, parent: parent)
+      expect(lantern_resource).to receive(:logical_replication).and_return(true)
+
+      allow(parent_representative_server).to receive_messages(list_all_databases: parent_databases, list_all_roles: ["postgres", "role2"], run_query: "0")
+      allow(representative_server).to receive_messages(list_all_databases: replica_databases, list_all_roles: ["postgres"], run_query: "0")
+
+      err = "The following roles were not synced to replica: role2\n"
+      expect { lantern_resource.prepare_switchover }.to raise_error "Inconsistencies found between parent and replica databases.\nPlease synchronize databases manually or create new replica or pass force=true if you are sure you want to switchover\n#{err}"
+    end
+
+    it "fails if large object count differs" do
+      representative_server = instance_double(LanternServer)
+      parent_representative_server = instance_double(LanternServer)
+      parent = instance_double(described_class, representative_server: parent_representative_server)
+      parent_databases = ["db1", "db2", "db3"]
+      replica_databases = ["db1"]
+
+      allow(lantern_resource).to receive_messages(representative_server: representative_server, parent: parent)
+      expect(lantern_resource).to receive(:logical_replication).and_return(true)
+
+      allow(parent_representative_server).to receive_messages(list_all_databases: parent_databases, list_all_roles: ["postgres", "role2"], run_query: "5")
+      allow(representative_server).to receive_messages(list_all_databases: replica_databases, list_all_roles: ["postgres"], run_query: "0")
+
+      err = "The following databases were not synced to replica: db2,db3\nThe following roles were not synced to replica: role2\nParent database has 5 more large objects than replica\n"
+      expect { lantern_resource.prepare_switchover }.to raise_error "Inconsistencies found between parent and replica databases.\nPlease synchronize databases manually or create new replica or pass force=true if you are sure you want to switchover\n#{err}"
+    end
+
+    it "success if all conditions pass" do
+      representative_server = instance_double(LanternServer)
+      parent_representative_server = instance_double(LanternServer)
+      parent = instance_double(described_class, representative_server: parent_representative_server)
+      parent_databases = ["db1", "db2", "db3"]
+      replica_databases = ["db1", "db2", "db3"]
+
+      allow(lantern_resource).to receive_messages(representative_server: representative_server, parent: parent)
+      expect(lantern_resource).to receive(:logical_replication).and_return(true)
+
+      allow(parent_representative_server).to receive_messages(list_all_databases: parent_databases, list_all_roles: ["postgres"], run_query: "5")
+      allow(representative_server).to receive_messages(list_all_databases: replica_databases, list_all_roles: ["postgres"], run_query: "5")
+
+      allow(lantern_resource).to receive(:incr_switchover_with_parent)
+
+      expect { lantern_resource.prepare_switchover }.not_to raise_error
+    end
+  end
 end
diff --git a/spec/model/lantern/lantern_server_spec.rb b/spec/model/lantern/lantern_server_spec.rb
index 715ddd5a8..97807cf59 100644
--- a/spec/model/lantern/lantern_server_spec.rb
+++ b/spec/model/lantern/lantern_server_spec.rb
@@ -285,6 +285,7 @@
       expect(lantern_server).to receive(:extras_version).and_return("0.1.4").at_least(:once)
       expect(lantern_server).to receive(:minor_version).and_return("1").at_least(:once)
       expect(vm).to receive(:boot_image).and_return(Config.gcp_default_image).at_least(:once)
+      expect(vm).to receive(:cores).and_return(2)
 
       walg_conf = timeline.generate_walg_config
       expected_conf = JSON.generate({
@@ -354,6 +355,7 @@
       expect(lantern_server).to receive(:extras_version).and_return("0.1.4").at_least(:once)
       expect(lantern_server).to receive(:minor_version).and_return("1").at_least(:once)
       expect(vm).to receive(:boot_image).and_return("custom-image").at_least(:once)
+      expect(vm).to receive(:cores).and_return(2)
 
       walg_conf = timeline.generate_walg_config
       expected_conf = JSON.generate({
@@ -421,6 +423,7 @@
       expect(lantern_server).to receive(:extras_version).and_return("0.1.4").at_least(:once)
       expect(lantern_server).to receive(:minor_version).and_return("1").at_least(:once)
       expect(vm).to receive(:boot_image).and_return("custom-image").at_least(:once)
+      expect(vm).to receive(:cores).and_return(1)
 
       walg_conf = timeline.generate_walg_config
       expected_conf = JSON.generate({
@@ -451,7 +454,7 @@
         gcp_creds_walg_b64: walg_conf[:gcp_creds_b64],
         walg_gs_prefix: walg_conf[:walg_gs_prefix],
         gcp_creds_big_query_b64: resource.gcp_creds_b64,
-        big_query_dataset: Config.lantern_log_dataset,
+        big_query_dataset: "",
         pg_version: 17
       })
       expect(lantern_server.configure_hash).to eq(expected_conf)
@@ -489,6 +492,7 @@
       expect(lantern_server).to receive(:extras_version).and_return("0.1.4").at_least(:once)
       expect(lantern_server).to receive(:minor_version).and_return("1").at_least(:once)
       expect(vm).to receive(:boot_image).and_return("custom-image").at_least(:once)
+      expect(vm).to receive(:cores).and_return(2)
 
       walg_conf = timeline.generate_walg_config
       expected_conf = JSON.generate({
@@ -696,6 +700,18 @@
     end
   end
 
+  describe "#list_all_roles" do
+    it "returns list of all roles which can login" do
+      expect(lantern_server.vm.sshable).to receive(:cmd).with("sudo docker compose -f /var/lib/lantern/docker-compose.yaml exec postgresql psql -U postgres -t -c 'SELECT rolname FROM pg_roles WHERE rolcanlogin=TRUE'").and_return("postgres\nrole2\n")
+      expect(lantern_server.list_all_roles).to eq(["postgres", "role2"])
+    end
+
+    it "returns list of all roles" do
+      expect(lantern_server.vm.sshable).to receive(:cmd).with("sudo docker compose -f /var/lib/lantern/docker-compose.yaml exec postgresql psql -U postgres -t -c 'SELECT rolname FROM pg_roles '").and_return("postgres\nrole2\nrole3\n")
+      expect(lantern_server.list_all_roles(false)).to eq(["postgres", "role2", "role3"])
+    end
+  end
+
   describe "#get_vm_image" do
     it "returns default image" do
       allow(described_class).to receive(:get_vm_image).and_call_original
diff --git a/spec/prog/lantern/lantern_resource_nexus_spec.rb b/spec/prog/lantern/lantern_resource_nexus_spec.rb
index df1a26c63..cca97cd2f 100644
--- a/spec/prog/lantern/lantern_resource_nexus_spec.rb
+++ b/spec/prog/lantern/lantern_resource_nexus_spec.rb
@@ -450,6 +450,13 @@
       expect(lantern_resource).to receive(:mark_switchover_finish)
       expect(timeline).to receive(:update).with(parent_id: nil)
 
+      frame = instance_double(Hash)
+      strand = instance_double(Strand, stack: [frame])
+      expect(frame).to receive(:delete).with("force_switchover")
+      expect(strand).to receive(:modified!)
+      expect(strand).to receive(:save_changes)
+      expect(nx).to receive(:strand).and_return(strand).at_least(:once)
+
       expect { nx.finish_take_over }.to hop("wait")
     end
   end
@@ -471,12 +478,19 @@
       parent = instance_double(LanternResource)
       expect(lantern_resource).to receive(:parent).and_return(parent)
       expect(parent).to receive(:get_logical_replication_lag).with("slot_#{lantern_resource.ubid}").and_return(5)
+      expect(nx).to receive(:frame).and_return({"force_switchover" => false})
 
       expect { nx.wait_for_synchronization }.to nap(5)
     end
 
+    it "hops to delete_logical_subscription if force_switchover" do
+      expect(nx).to receive(:frame).and_return({"force_switchover" => true})
+      expect { nx.wait_for_synchronization }.to hop("delete_logical_subscription")
+    end
+
     it "hops to delete_logical_subscription" do
       parent = instance_double(LanternResource)
+      expect(nx).to receive(:frame).and_return({"force_switchover" => false})
       expect(lantern_resource).to receive(:parent).and_return(parent)
       expect(parent).to receive(:get_logical_replication_lag).with("slot_#{lantern_resource.ubid}").and_return(0)
 
diff --git a/spec/routes/api/project/location/lantern_spec.rb b/spec/routes/api/project/location/lantern_spec.rb
index a79ad80a4..07c223a1f 100644
--- a/spec/routes/api/project/location/lantern_spec.rb
+++ b/spec/routes/api/project/location/lantern_spec.rb
@@ -373,31 +373,20 @@
         query_res = class_double(LanternResource, first: pg)
         allow(query_res).to receive(:where).and_return(query_res)
         expect(project).to receive(:lantern_resources_dataset).and_return(query_res)
+        err = "Database does not have parent or is not in logical replication state"
+        expect(pg).to receive(:prepare_switchover).and_raise err
 
         post "/api/project/#{project.ubid}/location/#{pg.location}/lantern/instance-1/switchover"
-        expect(last_response.status).to eq(400)
-      end
-
-      it "fails because not in logical replication mode" do
-        expect(Project).to receive(:from_ubid).and_return(project).at_least(:once)
-        query_res = class_double(LanternResource, first: pg)
-        expect(pg).to receive(:parent).and_return(instance_double(LanternResource))
-        expect(pg).to receive(:logical_replication).and_return(false)
-        allow(query_res).to receive(:where).and_return(query_res)
-        expect(project).to receive(:lantern_resources_dataset).and_return(query_res)
-
-        post "/api/project/#{project.ubid}/location/#{pg.location}/lantern/instance-1/switchover"
-        expect(last_response.status).to eq(400)
+        expect(last_response.status).to eq(422)
+        expect(JSON.parse(last_response.body)["error"]).to eq(err)
       end
 
       it "performs switchover" do
         expect(Project).to receive(:from_ubid).and_return(project).at_least(:once)
         query_res = class_double(LanternResource, first: pg)
-        expect(pg).to receive(:parent).and_return(instance_double(LanternResource))
-        expect(pg).to receive(:logical_replication).and_return(true)
         allow(query_res).to receive(:where).and_return(query_res)
         expect(project).to receive(:lantern_resources_dataset).and_return(query_res)
-        expect(pg).to receive(:incr_switchover_with_parent)
+        expect(pg).to receive(:prepare_switchover)
 
         post "/api/project/#{project.ubid}/location/#{pg.location}/lantern/instance-1/switchover"
         expect(last_response.status).to eq(200)