Skip to content

Commit

Permalink
Handle discovery failure stuck (#1630)
Browse files Browse the repository at this point in the history
Co-authored-by: arik <alon.arik@gmail.com>
  • Loading branch information
moshemorad and arikalon1 authored Nov 17, 2024
1 parent b91e89f commit fc3b062
Show file tree
Hide file tree
Showing 2 changed files with 45 additions and 2 deletions.
3 changes: 1 addition & 2 deletions src/robusta/core/discovery/discovery.py
Original file line number Diff line number Diff line change
Expand Up @@ -169,7 +169,6 @@ def __create_service_info(
)

@staticmethod

def create_service_info_from_hikaru(obj: Union[Deployment, DaemonSet, StatefulSet, Pod, ReplicaSet]) -> ServiceInfo:
return Discovery.__create_service_info_from_hikaru(
obj.metadata,
Expand All @@ -187,7 +186,7 @@ def create_service_info_from_hikaru(obj: Union[Deployment, DaemonSet, StatefulSe
def discovery_process() -> DiscoveryResults:
create_monkey_patches()
Discovery.stacktrace_thread_active = True
threading.Thread(target=Discovery.stack_dump_on_signal).start()
threading.Thread(target=Discovery.stack_dump_on_signal, daemon=True).start()
pods_metadata: List[V1ObjectMeta] = []
node_requests = defaultdict(list) # map between node name, to request of pods running on it
active_services: List[ServiceInfo] = []
Expand Down
44 changes: 44 additions & 0 deletions tests/discovery/test_discovery.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
import signal
from concurrent.futures import ProcessPoolExecutor
from contextlib import contextmanager
from http import HTTPStatus
from typing import Any, Generator, NoReturn
from unittest.mock import patch

import kubernetes
import pytest
from kubernetes.client.exceptions import ApiException

from robusta.core.discovery.discovery import Discovery


# pytest-timeout requires pytest>=7 (https://github.com/pytest-dev/pytest-timeout/blob/main/setup.cfg),
# so a small SIGALRM-based time limit is implemented here instead.
@contextmanager
def time_limit(seconds: int) -> Generator[None, Any, None]:
    """Fail the current test if the managed block runs longer than ``seconds``.

    A ``SIGALRM`` handler that calls ``pytest.fail`` is installed and an alarm
    is armed for ``seconds`` seconds. When the block exits (normally or through
    an exception) the alarm is cancelled and the handler that was installed
    before entering is put back, so the time limit does not leak into code
    that runs afterwards.

    Args:
        seconds: Whole number of seconds the block may run before the test is
            failed.

    Yields:
        None. The context manager only guards the enclosed block.
    """

    def signal_handler(_signum: Any, _frame: Any) -> NoReturn:
        pytest.fail("Test took to much time...")

    # signal.signal returns the handler it replaced; keep it for restoration.
    previous_handler = signal.signal(signal.SIGALRM, signal_handler)
    try:
        signal.alarm(seconds)
        yield
    finally:
        # Cancel any pending alarm first so it cannot fire once the previous
        # handler is back in place.
        signal.alarm(0)
        # A None result means the previous handler was not installed from
        # Python and cannot be passed back to signal.signal.
        if previous_handler is not None:
            signal.signal(signal.SIGALRM, previous_handler)


def _patch_worker() -> None:
    """Make ``kubernetes.client.CoreV1Api.list_node`` raise on every call.

    Used in this file as the ``initializer`` of a ``ProcessPoolExecutor``.
    It replaces ``list_node`` on the ``CoreV1Api`` class with a stub that
    raises an ``ApiException`` built from ``HTTPStatus.INTERNAL_SERVER_ERROR``.
    """

    def _raise_server_error(self: Any, **_: Any) -> NoReturn:
        # Accept and ignore any keyword arguments the caller passes.
        raise ApiException(HTTPStatus.INTERNAL_SERVER_ERROR, reason="Internal Server Error")

    setattr(kubernetes.client.CoreV1Api, "list_node", _raise_server_error)


def test_discovery_recovery_on_failure():
    """Check that a failed discovery leaves ``Discovery`` with a usable executor.

    ``Discovery.executor`` is temporarily swapped for a single-worker pool
    whose worker is initialised with ``_patch_worker``, so ``list_node``
    raises ``ApiException``. The test expects ``discover_resources`` to
    propagate that exception, the injected pool to have been shut down, and
    ``Discovery.executor`` to refer to a pool that has not been shut down.
    The whole scenario must finish within 20 seconds.
    """
    with time_limit(20):
        failing_pool = ProcessPoolExecutor(1, initializer=_patch_worker)
        with patch.object(Discovery, "executor", new=failing_pool):
            with pytest.raises(ApiException):
                Discovery.discover_resources()

            # The injected pool is expected to be shut down after the failure...
            assert failing_pool._shutdown_thread
            # ...while the executor Discovery now holds must not be.
            assert not Discovery.executor._shutdown_thread

0 comments on commit fc3b062

Please sign in to comment.