[DPE-4575][DPE-4886][DPE-4983] Add voting exclusions management #367

Closed
wants to merge 44 commits into from
Changes from 25 commits
Commits (44)
52fcb0d
Rebase and add support for voting exclusions
phvalguima Jul 8, 2024
8802410
Add integration test for 3->1->3 scaling; plus fixes
phvalguima Jul 8, 2024
d360fe9
Add fix for scaling test
phvalguima Jul 8, 2024
05c828b
Fix issue with continuous_writes
phvalguima Jul 8, 2024
186dadd
Add rudimentary check for shard relocation
phvalguima Jul 8, 2024
784dd0f
Move to self.unit_name
phvalguima Jul 8, 2024
b4b9fde
Add safeguards at stopping for the app removal case
phvalguima Jul 9, 2024
b9becc8
Move the _get_nodes up in _stop_opensearch
phvalguima Jul 9, 2024
f22f75b
Focus remove-app test to only remove opensearch
phvalguima Jul 9, 2024
ce0c3e6
Add explain call to update status
phvalguima Jul 10, 2024
3e59a79
Add explain call to update status
phvalguima Jul 10, 2024
b3a9c11
Add explain call to update status
phvalguima Jul 10, 2024
1bc6555
Add cluster explain API
phvalguima Jul 10, 2024
ee49e33
Fix explain on node_lock
phvalguima Jul 10, 2024
5a9e006
Node lock testing -- move logging
phvalguima Jul 10, 2024
e633312
Add try/catch for the allocation
phvalguima Jul 10, 2024
fc148a5
Extend voting exclusion settling to scenarios: covers outage cases; f…
phvalguima Jul 12, 2024
68ca4f0
Add unit tests for exclusions logic
phvalguima Jul 15, 2024
8fa6f29
Add fixes to unit tests and removed commented code
phvalguima Jul 15, 2024
48c511e
Add fixes for integration tests and unit tests
phvalguima Jul 15, 2024
3b16370
Update helper_cluster.py
phvalguima Jul 17, 2024
4004ba5
Add retry to the lock
phvalguima Jul 18, 2024
6c2137c
Remove fix for dashboards, diff PR; add comments on update_status
phvalguima Jul 18, 2024
c45e7e9
Update helper_cluster.py
phvalguima Jul 22, 2024
4591f2b
Update helper_cluster.py
phvalguima Jul 22, 2024
6ff1ad9
Updates following review
phvalguima Aug 6, 2024
9e0e20d
Merge remote-tracking branch 'origin/main' into with-tests-DPE-4057-v…
phvalguima Aug 6, 2024
c3b71e7
fix elected_manager to return the Node object instead of the ID strin…
phvalguima Aug 6, 2024
3ebadee
Fixes for exclusions, locking and health
phvalguima Aug 7, 2024
f0e0fec
Extend to a larger runner
phvalguima Aug 7, 2024
bbca393
WIP: updating integration test_ha for 2 nodes
phvalguima Aug 8, 2024
adeef52
WIP(2): updating integration test_ha for 2 nodes
phvalguima Aug 8, 2024
3d74325
Merge remote-tracking branch 'origin' into with-tests-DPE-4057-voting…
phvalguima Aug 8, 2024
e14ae76
Fix scale up and down test
phvalguima Aug 9, 2024
b909c16
Update from main branch
phvalguima Aug 12, 2024
af9fa25
move from xlarge to large
phvalguima Aug 12, 2024
7c1595a
Add more info in the test_restart_db_process_node_with_elected_cm
phvalguima Aug 12, 2024
3851b2d
add more logging
phvalguima Aug 12, 2024
631b6a8
_on_update_status: remove the is_node_up() from the start and move it…
phvalguima Aug 13, 2024
8972bf3
Merge remote-tracking branch 'origin' into with-tests-DPE-4057-voting…
phvalguima Aug 13, 2024
ad524e6
Fix node exclusions to also check for service running, plus manually …
phvalguima Aug 13, 2024
77b1c67
remove pdb mentions to test_ha
phvalguima Aug 13, 2024
d84e1d5
Extend is_active to consider other cases such as an stopped process
phvalguima Aug 14, 2024
bd97709
Add missing rstrip()
phvalguima Aug 14, 2024
20 changes: 19 additions & 1 deletion lib/charms/opensearch/v0/helper_cluster.py
@@ -139,6 +139,14 @@ def nodes_by_role(nodes: List[Node]) -> Dict[str, List[Node]]:

return result

@staticmethod
def elected_manager(nodes: List[Node]) -> Optional[Node]:
"""Return the elected cluster manager node from the given list, if any."""
for node in nodes:
if node.elected_manager:
return node
return None

@staticmethod
def nodes(
opensearch: OpenSearchDistribution,
@@ -153,18 +161,28 @@ def nodes(

nodes: List[Node] = []
if use_localhost or host:
manager_id = opensearch.request(
"GET",
"/_cluster/state/cluster_manager_node",
host=host,
alt_hosts=alt_hosts,
retries=3,
)
if "cluster_manager_node" in manager_id:
manager_id = manager_id["cluster_manager_node"]
response = opensearch.request(
"GET", "/_nodes", host=host, alt_hosts=alt_hosts, retries=3
)
if "nodes" in response:
for obj in response["nodes"].values():
for id, obj in response["nodes"].items():
node = Node(
name=obj["name"],
roles=obj["roles"],
ip=obj["ip"],
app=App(id=obj["attributes"]["app_id"]),
unit_number=int(obj["name"].split(".")[0].split("-")[-1]),
temperature=obj.get("attributes", {}).get("temp"),
elected_manager=id == manager_id,
)
nodes.append(node)
return nodes
1 change: 1 addition & 0 deletions lib/charms/opensearch/v0/models.py
@@ -121,6 +121,7 @@ class Node(Model):
app: App
unit_number: int
temperature: Optional[str] = None
elected_manager: bool = False

@classmethod
@validator("roles")
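A minimal usage sketch (not part of the PR) of the new elected_manager helper together with the Node.elected_manager field introduced above. It assumes it runs inside the charm, so self.opensearch, self.alt_hosts, and a module-level logger are in scope, and that the library is imported through its usual charm-lib path.

# Hypothetical usage sketch: resolve the currently elected cluster manager
# from the annotated node list returned by ClusterTopology.nodes().
from charms.opensearch.v0.helper_cluster import ClusterTopology

nodes = ClusterTopology.nodes(
    self.opensearch,       # OpenSearchDistribution instance held by the charm
    use_localhost=True,
    hosts=self.alt_hosts,  # fall-back hosts in case localhost is down
)
manager = ClusterTopology.elected_manager(nodes)
if manager is None:
    # No node reported itself as elected manager yet; the caller should retry.
    logger.debug("No elected cluster manager found yet")
else:
    logger.debug("Elected cluster manager: %s (%s)", manager.name, manager.ip)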
152 changes: 126 additions & 26 deletions lib/charms/opensearch/v0/opensearch_base_charm.py
@@ -102,7 +102,13 @@
)
from ops.framework import EventBase, EventSource
from ops.model import BlockedStatus, MaintenanceStatus, WaitingStatus
from tenacity import RetryError, Retrying, stop_after_attempt, wait_fixed
from tenacity import (
RetryError,
Retrying,
stop_after_attempt,
stop_after_delay,
wait_fixed,
)

import lifecycle
import upgrade
@@ -436,7 +442,7 @@ def _on_peer_relation_changed(self, event: RelationChangedEvent):
if not (unit_data := event.relation.data.get(event.unit)):
return

self.opensearch_exclusions.cleanup()
self.opensearch_exclusions.allocation_cleanup()

if self.unit.is_leader() and unit_data.get("bootstrap_contributor"):
contributor_count = self.peers_data.get(Scope.APP, "bootstrap_contributors_count", 0)
@@ -471,10 +477,15 @@ def _on_opensearch_data_storage_detaching(self, _: StorageDetachingEvent): # no
logger.warning(
"Removing units during an upgrade is not supported. The charm may be in a broken, unrecoverable state"
)
# acquire lock to ensure only 1 unit removed at a time
if not self.node_lock.acquired:
# Raise uncaught exception to prevent Juju from removing unit
raise Exception("Unable to acquire lock: Another unit is starting or stopping.")

for attempt in Retrying(stop=stop_after_attempt(6), wait=wait_fixed(10), reraise=True):
with attempt:
# acquire lock to ensure only 1 unit removed at a time
if not self.node_lock.acquired:
# Raise uncaught exception to prevent Juju from removing unit
raise Exception(
"Unable to acquire lock: Another unit is starting or stopping."
)

# if the leader is departing, and this hook fails "leader elected" won"t trigger,
# so we want to re-balance the node roles from here
@@ -515,7 +526,7 @@ def _on_opensearch_data_storage_detaching(self, _: StorageDetachingEvent): # no
# release lock
self.node_lock.release()

def _on_update_status(self, event: UpdateStatusEvent):
def _on_update_status(self, event: UpdateStatusEvent): # noqa: C901
"""On update status event.

We want to periodically check for the following:
@@ -537,17 +548,26 @@ def _on_update_status(self, event: UpdateStatusEvent):

# if there are exclusions to be removed
if self.unit.is_leader():
self.opensearch_exclusions.cleanup()
Contributor: Before this change the cleanup ran on every update_status; now it only runs when the health is green. Is this on purpose?

Contributor Author: That is not the case: it runs in every case except HealthColors.UNKNOWN. We do defer the event if health is not green. I moved it further down because the API must be responsive before we configure voting exclusions; if it is not responsive, we will get UNKNOWN anyway and retry later. I will add some comments to clarify that.

Contributor: Is there a reason the shard-allocation exclusion cleanup is postponed until later in the hook? As long as there is connectivity to a host, we should be able to clean up.

Contributor Author: Yes. The health checks below let everything pass unless the cluster is in a really bad state (i.e. UNKNOWN), so the cleanup was moved after those first health checks, where it makes more sense.


if (health := self.health.apply(wait_for_green_first=True)) not in [
HealthColors.GREEN,
HealthColors.IGNORE,
]:
# Do not return right now!
# We must first check if we need to remove exclusions
event.defer()

# Unless it is unknown, in this case we can return and wait for the next run
if health == HealthColors.UNKNOWN:
return

self.opensearch_exclusions.allocation_cleanup()
# Now, review voting exclusions, as we may have lost a unit due to an outage
try:
self._settle_voting_exclusions(unit_is_stopping=False)
except RetryError:
# We need to retry later as the cluster does not seem to be stable enough
event.defer()

for relation in self.model.relations.get(ClientRelationName, []):
self.opensearch_provider.update_endpoints(relation)

@@ -573,6 +593,9 @@ def _on_config_changed(self, event: ConfigChangedEvent): # noqa C901
restart_requested = False
if self.opensearch_config.update_host_if_needed():
restart_requested = True
# Review voting exclusions as our IP has changed: we may be coming back from a network
# outage case.
self._settle_voting_exclusions(unit_is_stopping=False)
Contributor: What will happen here if no other node is online (self.alt_hosts == [])? I believe this will retry for 5 minutes and eventually crash with a RetryError.

Contributor Author: @Mehdi-Bendriss so self.alt_hosts will not return the local host in the list? How can I get it then?

Contributor Author: Changed the way we call both ClusterTopology's elected_manager and nodes. They will now do more checks on self.alt_hosts.


self.status.set(MaintenanceStatus(TLSNewCertsRequested))
self.tls.delete_stored_tls_resources()
@@ -912,7 +935,8 @@ def _post_start_init(self, event: _StartOpenSearch): # noqa: C901
self._cleanup_bootstrap_conf_if_applies()

# Remove the exclusions that could not be removed when no units were online
self.opensearch_exclusions.delete_current()
self._settle_voting_exclusions(unit_is_stopping=False)
self.opensearch_exclusions.delete_allocations_exclusion()

self.node_lock.release()

@@ -1004,37 +1028,39 @@ def _post_start_init(self, event: _StartOpenSearch): # noqa: C901
if self.opensearch_peer_cm.is_provider():
self.peer_cluster_provider.refresh_relation_data(event, can_defer=False)

def _stop_opensearch(self, *, restart=False) -> None:
def _stop_opensearch(self, *, restart=False) -> None: # noqa: C901
"""Stop OpenSearch if possible."""
self.status.set(WaitingStatus(ServiceIsStopping))

nodes = self._get_nodes(True)
if self.opensearch.is_node_up():
try:
nodes = self._get_nodes(True)
# do not add exclusions if it's the last unit to stop
# otherwise cluster manager election will be blocked when starting up again
# and re-using storage
if len(nodes) > 1:
# TODO: we should probably NOT have any exclusion on restart
# https://chat.canonical.com/canonical/pl/bgndmrfxr7fbpgmwpdk3hin93c
# 1. Add current node to the voting + alloc exclusions
Contributor: Could you put back those comments? The description is still current and valid.

Contributor Author: The first description, yes. The second one ("TODO: we should probably NOT have any exclusion on restart") is outdated: we now only add nodes to the voting exclusions if we are scaling between 3 and 1 nodes, or vice versa.

self.opensearch_exclusions.add_current()
self.opensearch_exclusions.add_allocations_exclusion()
except OpenSearchHttpError:
logger.debug("Failed to get online nodes, voting and alloc exclusions not added")
logger.debug("Failed to get online nodes, alloc exclusion not added")

self._settle_voting_exclusions(unit_is_stopping=True)

# TODO: should block until all shards move addressed in PR DPE-2234
# TODO: improve relocation of all shards move in PR DPE-2234
if len(nodes) > 1:
# this check only makes sense if we have more than one unit.
for attempt in Retrying(stop=stop_after_delay(300), wait=wait_fixed(10)):
with attempt:
if self.health.apply(wait_for_green_first=True) != HealthColors.GREEN:
raise OpenSearchHAError(
"Timed out waiting for shard relocation to complete"
)

# 2. stop the service
self.opensearch.stop()
self.peers_data.delete(Scope.UNIT, "started")
self.status.set(WaitingStatus(ServiceStopped))

# 3. Remove the exclusions
# TODO: we should probably NOT have any exclusion on restart
# https://chat.canonical.com/canonical/pl/bgndmrfxr7fbpgmwpdk3hin93c
if not restart:
try:
self.opensearch_exclusions.delete_current()
self.opensearch_exclusions.delete_allocations_exclusion()
except Exception:
# It is purposefully broad - as this can fail for HTTP reasons,
# or if the config wasn't set on disk etc. In any way, this operation is on
@@ -1060,6 +1086,80 @@ def _restart_opensearch(self, event: _RestartOpenSearch) -> None:

self._start_opensearch_event.emit()

def _settle_voting_exclusions(self, unit_is_stopping: bool = False): # noqa C901
"""Settle the exclusions for all voting units."""
hosts = self.alt_hosts
if (
self.opensearch_peer_cm.deployment_desc().typ != DeploymentType.MAIN_ORCHESTRATOR
and (peer_cm_rel_data := self.opensearch_peer_cm.rel_data()) is not None
):
# Also consider peer-relation units
hosts.extend([node.ip for node in peer_cm_rel_data.cm_nodes])
Contributor: This is not needed. self.alt_hosts is enough; it already contains all possible alternative hosts, including those from the large-deployment relations.

Note: please do not rely on rel_data for as long as you can - we should aim for a clear separation of concerns. The two checks can be replaced more robustly with self.opensearch_peer_cm.is_consumer().

Suggested change (remove these lines):

if (
self.opensearch_peer_cm.deployment_desc().typ != DeploymentType.MAIN_ORCHESTRATOR
and (peer_cm_rel_data := self.opensearch_peer_cm.rel_data()) is not None
):
# Also consider peer-relation units
hosts.extend([node.ip for node in peer_cm_rel_data.cm_nodes])

Contributor Author: If self.alt_hosts returns the right value in any case, do I still need that if and self.opensearch_peer_cm.is_consumer() then?


for attempt in Retrying(stop=stop_after_delay(300), wait=wait_fixed(10)):
with attempt:
nodes = ClusterTopology.nodes(self.opensearch, use_localhost=True, hosts=hosts)

node = None
for node in nodes:
if node.name == self.unit_name:
break
if unit_is_stopping and node:
# This is a stop operation, we exclude the unit that is stopping
# if still in the list
nodes.remove(node)
# we can finish the for loop here
break
elif not unit_is_stopping and not node:
# This unit is starting, we must assure it shows up in the list
# Let's retry
raise OpenSearchStartError("Node not in the list of nodes")
Contributor: Can you explain this? If we call settle_voting_exclusions from post_start_init, why should we have this check and exception?

Contributor Author: The settle-voting logic must be executed synchronously with a (re)start; that is why I am adding these two retry loops (one here and another at the end). Otherwise, a problem here can leave the entire cluster stuck in the "metadata error" state we were seeing before.

Because there is a delay between powering up and the service actually being up, this loop ensures we wait long enough for the opensearch service to come up.

# Any other case is okay to move forward without changes
break

cms = ClusterTopology.get_cluster_managers_names(nodes)
# For the sake of predictability, we always sort the cluster managers
sorted_cm = sorted(cms)
# We always clean the voting.
# Then, we deal with the specific cases.
self.opensearch_exclusions.delete_voting()

# Each of the possible cases
if len(cms) == 1:
# This condition only happens IF the cluster had 2x units and now is scaling down to 1
# In this scenario, we should exclude the node that is going away, to force election
Contributor: Can you explain why this is needed? If the cluster scales down from 2 to 1 unit, doesn't that mean an election will happen anyway?

Contributor Author: If we made it to this point, any previous voting exclusions have already been removed, so we have 2 voting units available in this cluster.

The problem is that with N = 2 voting units, the minimum quorum to keep operating is floor(N / 2) + 1 = 2 units (a short worked sketch of this arithmetic follows the end of this file's diff).

If I remove one unit, the remaining one cannot re-elect on its own. So I apply this exclusion to reduce the voting configuration down to 1 unit, which also moves the cluster manager if the unit going away was the manager.

Contributor Author: It must happen before stopping, so we are sure the metadata will be updated on the unit that is staying. This is the only way I found in my own tests to safely downscale from 2 to 1 unit.

# to the remaining unit.
if unit_is_stopping:
self.opensearch_exclusions.add_voting(hosts, node_names=[self.unit_name])
elif len(cms) == 2:
if unit_is_stopping:
# Remove both this unit and the first sorted_cm from the voting
self.opensearch_exclusions.add_voting(
hosts, node_names=[self.unit_name, sorted_cm[0]]
)
else:
# We are adding this unit to the cluster and we've waited until it is present
# We only exclude one unit:
self.opensearch_exclusions.add_voting(hosts, node_names=[sorted_cm[0]])
# Now, we clean up the sorted_cm list, as we want to be sure the new manager is elected
# and different than the excluded units.
sorted_cm.pop(0)
Contributor: I was wondering why you chose to exclude node [0] from voting and remove it from the list? This will result in a new cluster manager being elected when you scale up from 1 unit to more, and in the process of removing the application. I just saw this locally when testing; it does not create that much latency, but wouldn't it be better to use the last one from the list instead?

Contributor Author (phvalguima, Jul 17, 2024): @reneradoi yes, I noticed that as well. But when discussing with @Mehdi-Bendriss, we agreed to make this list predictable instead of keeping track of the cluster manager. I could add a check here for that, but then the other check right before gets slightly more complicated:

            if unit_is_stopping:
                # Remove both this unit and the first sorted_cm from the voting
                self.opensearch_exclusions.add_voting(
                    hosts, node_names=[self.unit_name, sorted_cm[0]]  ## <<<------ should we also add a check here?
                )

Given that this is quite an exception (i.e. going from 1->2 or 2->1), I took the simpler approach.

Contributor Author: I can improve the comments around here; this logic is pretty brittle, to be honest.

Contributor: I think I overlooked and missed a safety component there: if we remain with 2 units and the one being removed is not the current elected CM, maybe it makes sense to add the other unit to the voting exclusion. This should reduce switchovers and the risk they entail.

Contributor Author: @Mehdi-Bendriss I would not recommend that. The main reason is that I noticed moving the elected manager between nodes is far faster than Juju hooks. We need to be predictable in this specific case, even if it means moving the elected manager.

# We do not exclude the self.unit_name
else:
# In this case, we either are scaling down to 0 or len(cms) > 2.
# There is nothing more to do then cleanup the exclusions
return

# Now, we must be sure a new manager is elected, or there was a failure
for attempt in Retrying(stop=stop_after_delay(300), wait=wait_fixed(10)):
with attempt:
manager = ClusterTopology.elected_manager(
ClusterTopology.nodes(self.opensearch, use_localhost=True, hosts=hosts)
)
if not manager or manager.name not in sorted_cm:
raise OpenSearchHAError("New manager not elected yet")
break

def _upgrade_opensearch(self, event: _UpgradeOpenSearch) -> None: # noqa: C901
"""Upgrade OpenSearch."""
logger.debug("Attempting to acquire lock for upgrade")
@@ -1176,7 +1276,7 @@ def _remove_data_role_from_dedicated_cm_if_needed( # noqa: C901
self.opensearch_config.remove_temporary_data_role()

# wait until data moves out completely
self.opensearch_exclusions.add_current()
self.opensearch_exclusions.add_allocations_exclusion()

try:
for attempt in Retrying(stop=stop_after_attempt(3), wait=wait_fixed(0.5)):
Expand All @@ -1189,7 +1289,7 @@ def _remove_data_role_from_dedicated_cm_if_needed( # noqa: C901
raise Exception
return True
except RetryError:
self.opensearch_exclusions.delete_current()
self.opensearch_exclusions.delete_allocations_exclusion()
event.defer()
return False

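The 2-to-1 scale-down discussion in the review thread above relies on majority-quorum arithmetic. The following is a small illustrative sketch, not part of the PR; the helper name voting_quorum is made up for illustration.

# Cluster-manager election needs a majority of the voting configuration:
# quorum = floor(N / 2) + 1 for N voting nodes.

def voting_quorum(voting_nodes: int) -> int:
    """Minimum number of voting nodes required to elect a cluster manager."""
    return voting_nodes // 2 + 1

assert voting_quorum(3) == 2  # three voters tolerate the loss of one
assert voting_quorum(2) == 2  # two voters: losing either one blocks election
assert voting_quorum(1) == 1  # a single voter elects itself

# Scaling 2 -> 1 without an exclusion: the survivor is 1 node, but the quorum
# of the old voting configuration is 2, so no manager can be elected.
# Excluding the departing unit first shrinks the voting configuration to 1,
# whose quorum is 1, and the remaining unit can elect itself.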
4 changes: 2 additions & 2 deletions lib/charms/opensearch/v0/opensearch_config.py
@@ -244,8 +244,8 @@ def update_host_if_needed(self) -> bool:
),
NetworkHost(
"network.publish_host",
set(node.get("network.publish_host", [])),
set(self._opensearch.host),
node.get("network.publish_host"),
self._opensearch.host,
),
]:
if not host.old:
21 changes: 20 additions & 1 deletion lib/charms/opensearch/v0/opensearch_locking.py
@@ -254,9 +254,28 @@ def acquired(self) -> bool: # noqa: C901
self._opensearch, use_localhost=host is not None, hosts=alt_hosts
)
)
try:
logger.debug(
"Current shard allocation status: %s",
self._opensearch.request(
"GET",
"/_cluster/allocation/explain?include_yes_decisions=true&include_disk_info=true",
payload={
"index": self.OPENSEARCH_INDEX,
"shard": 0,
"primary": "true",
},
),
)
except Exception:
logger.debug("Current shard allocation status: error to connect with API")
pass

except OpenSearchHttpError:
logger.exception("Error getting OpenSearch nodes")
return False
# If we are trying to acquire the lock at application removal, this condition
# will eventually be hit
return len(self.units) <= 1
logger.debug(f"[Node lock] Opensearch {online_nodes=}")
assert online_nodes > 0
try:
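The debug call added above uses the cluster allocation explain API. For manual troubleshooting outside the charm, a roughly equivalent standalone request can be made directly against a node; this is only a sketch, and the host, credentials, certificate path, and index name below are placeholders.

# Standalone sketch of the same allocation-explain query, for manual debugging.
import requests

resp = requests.get(
    "https://10.0.0.1:9200/_cluster/allocation/explain",
    params={"include_yes_decisions": "true", "include_disk_info": "true"},
    json={"index": "my-locking-index", "shard": 0, "primary": True},  # placeholder index
    auth=("admin", "<password>"),  # placeholder credentials
    verify="/path/to/ca.pem",      # placeholder CA certificate
    timeout=10,
)
resp.raise_for_status()
explanation = resp.json()
# Fields of interest typically include "current_state", "unassigned_info",
# and the per-node allocation decisions explaining why a shard can(not) move.
print(explanation.get("current_state"))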