Merge pull request #1534 from BerriAI/litellm_custom_cooldown_times
[Feat] Litellm.Router set custom cooldown times
ishaan-jaff authored Jan 20, 2024
2 parents ccbc471 + 5295168 commit 435d4b9
Showing 3 changed files with 138 additions and 6 deletions.
docs/my-website/docs/routing.md (5 changes: 3 additions & 2 deletions)
@@ -603,10 +603,11 @@ def __init__(
     timeout: Optional[float] = None,
     default_litellm_params={}, # default params for Router.chat.completion.create
     fallbacks: List = [],
-    allowed_fails: Optional[int] = None,
+    allowed_fails: Optional[int] = None, # Number of times a deployment can fail before being added to cooldown
+    cooldown_time: float = 1, # (seconds) time to cooldown a deployment after failure
     context_window_fallbacks: List = [],
     model_group_alias: Optional[dict] = {},
-    retry_after: int = 0, # min time to wait before retrying a failed request
+    retry_after: int = 0, # (min) time to wait before retrying a failed request
     routing_strategy: Literal[
         "simple-shuffle",
         "least-busy",
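For context, here is a minimal usage sketch of the new parameter, based only on the signature documented above — the single deployment entry and its environment variables are illustrative placeholders, not part of this commit:

import os
from litellm import Router

# One placeholder Azure deployment; swap in real credentials to run this.
model_list = [
    {
        "model_name": "gpt-3.5-turbo",
        "litellm_params": {
            "model": "azure/chatgpt-v-2",
            "api_key": os.getenv("AZURE_API_KEY"),
            "api_version": os.getenv("AZURE_API_VERSION"),
            "api_base": os.getenv("AZURE_API_BASE"),
        },
    },
]

router = Router(
    model_list=model_list,
    allowed_fails=1,  # tolerate 1 failure per minute before cooling a deployment down
    cooldown_time=30,  # (seconds) keep a failed deployment out of rotation for 30s
)

response = router.completion(
    model="gpt-3.5-turbo",
    messages=[{"role": "user", "content": "Tell me a joke."}],
)
print(response)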
litellm/router.py (49 changes: 45 additions & 4 deletions)
@@ -96,10 +96,13 @@ def __init__(
         set_verbose: bool = False,
         debug_level: Literal["DEBUG", "INFO"] = "INFO",
         fallbacks: List = [],
-        allowed_fails: Optional[int] = None,
         context_window_fallbacks: List = [],
         model_group_alias: Optional[dict] = {},
         retry_after: int = 0, # min time to wait before retrying a failed request
+        allowed_fails: Optional[
+            int
+        ] = None, # Number of times a deployment can fail before being added to cooldown
+        cooldown_time: float = 1, # (seconds) time to cooldown a deployment after failure
         routing_strategy: Literal[
             "simple-shuffle",
             "least-busy",
@@ -108,6 +111,36 @@
         ] = "simple-shuffle",
         routing_strategy_args: dict = {}, # just for latency-based routing
     ) -> None:
+        """
+        Initialize the Router class with the given parameters for caching, reliability, and routing strategy.
+
+        Args:
+            model_list (Optional[list]): List of models to be used. Defaults to None.
+            redis_url (Optional[str]): URL of the Redis server. Defaults to None.
+            redis_host (Optional[str]): Hostname of the Redis server. Defaults to None.
+            redis_port (Optional[int]): Port of the Redis server. Defaults to None.
+            redis_password (Optional[str]): Password of the Redis server. Defaults to None.
+            cache_responses (Optional[bool]): Flag to enable caching of responses. Defaults to False.
+            cache_kwargs (dict): Additional kwargs to pass to RedisCache. Defaults to {}.
+            caching_groups (Optional[List[tuple]]): List of model groups for caching across model groups. Defaults to None.
+            client_ttl (int): Time-to-live for cached clients in seconds. Defaults to 3600.
+            num_retries (int): Number of retries for failed requests. Defaults to 0.
+            timeout (Optional[float]): Timeout for requests. Defaults to None.
+            default_litellm_params (dict): Default parameters for Router.chat.completion.create. Defaults to {}.
+            set_verbose (bool): Flag to set verbose mode. Defaults to False.
+            debug_level (Literal["DEBUG", "INFO"]): Debug level for logging. Defaults to "INFO".
+            fallbacks (List): List of fallback options. Defaults to [].
+            context_window_fallbacks (List): List of context window fallback options. Defaults to [].
+            model_group_alias (Optional[dict]): Alias for model groups. Defaults to {}.
+            retry_after (int): Minimum time to wait before retrying a failed request. Defaults to 0.
+            allowed_fails (Optional[int]): Number of allowed fails before adding to cooldown. Defaults to None.
+            cooldown_time (float): Time to cooldown a deployment after failure in seconds. Defaults to 1.
+            routing_strategy (Literal["simple-shuffle", "least-busy", "usage-based-routing", "latency-based-routing"]): Routing strategy. Defaults to "simple-shuffle".
+            routing_strategy_args (dict): Additional args for latency-based routing. Defaults to {}.
+
+        Returns:
+            Router: An instance of the litellm.Router class.
+        """
         self.set_verbose = set_verbose
         if self.set_verbose:
             if debug_level == "INFO":
@@ -163,6 +196,7 @@ def __init__(
             self.deployment_latency_map[m["litellm_params"]["model"]] = 0

         self.allowed_fails = allowed_fails or litellm.allowed_fails
+        self.cooldown_time = cooldown_time or 1
         self.failed_calls = (
             InMemoryCache()
         ) # cache to track failed calls per deployment, if num failed calls within 1 minute > allowed fails, then add it to cooldown
@@ -1365,6 +1399,7 @@ def _set_cooldown_deployments(self, deployment: Optional[str] = None):
         verbose_router_logger.debug(
             f"Attempting to add {deployment} to cooldown list. updated_fails: {updated_fails}; self.allowed_fails: {self.allowed_fails}"
         )
+        cooldown_time = self.cooldown_time or 1
         if updated_fails > self.allowed_fails:
             # get the current cooldown list for that minute
             cooldown_key = f"{current_minute}:cooldown_models" # group cooldown models by minute to reduce number of redis calls
@@ -1378,13 +1413,19 @@ def _set_cooldown_deployments(self, deployment: Optional[str] = None):
                 else:
                     cached_value = cached_value + [deployment]
                     # save updated value
-                    self.cache.set_cache(value=cached_value, key=cooldown_key, ttl=1)
+                    self.cache.set_cache(
+                        value=cached_value, key=cooldown_key, ttl=cooldown_time
+                    )
             except:
                 cached_value = [deployment]
                 # save updated value
-                self.cache.set_cache(value=cached_value, key=cooldown_key, ttl=1)
+                self.cache.set_cache(
+                    value=cached_value, key=cooldown_key, ttl=cooldown_time
+                )
         else:
-            self.failed_calls.set_cache(key=deployment, value=updated_fails, ttl=1)
+            self.failed_calls.set_cache(
+                key=deployment, value=updated_fails, ttl=cooldown_time
+            )

     def _get_cooldown_deployments(self):
         """
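To see the cooldown mechanism in isolation, here is a self-contained sketch of the pattern the hunks above implement — both the failure counter and the cooldown list live in a cache whose entries expire after a TTL, so a cooled-down deployment re-enters rotation automatically once cooldown_time elapses. TTLCache and record_failure are simplified stand-ins invented for illustration (the real code buckets the cooldown key by minute and uses litellm's InMemoryCache or Redis cache), not litellm APIs:

import time


class TTLCache:
    """Simplified stand-in for litellm's InMemoryCache: values expire after a TTL."""

    def __init__(self):
        self._store = {}  # key -> (value, expiry timestamp)

    def set_cache(self, key, value, ttl):
        self._store[key] = (value, time.time() + ttl)

    def get_cache(self, key):
        entry = self._store.get(key)
        if entry is None:
            return None
        value, expiry = entry
        if time.time() > expiry:  # expired entries behave like a cache miss
            del self._store[key]
            return None
        return value


ALLOWED_FAILS = 1  # mirrors allowed_fails
COOLDOWN_TIME = 0.1  # mirrors cooldown_time (seconds)

failed_calls = TTLCache()  # per-deployment failure counts
cooldowns = TTLCache()  # deployments currently cooling down


def record_failure(deployment):
    updated_fails = (failed_calls.get_cache(deployment) or 0) + 1
    if updated_fails > ALLOWED_FAILS:
        # park the deployment; the TTL brings it back automatically
        current = cooldowns.get_cache("cooldown_models") or []
        cooldowns.set_cache("cooldown_models", current + [deployment], ttl=COOLDOWN_TIME)
    else:
        failed_calls.set_cache(deployment, updated_fails, ttl=COOLDOWN_TIME)


record_failure("azure/chatgpt-v-2")
record_failure("azure/chatgpt-v-2")
print(cooldowns.get_cache("cooldown_models"))  # ['azure/chatgpt-v-2']
time.sleep(COOLDOWN_TIME)
print(cooldowns.get_cache("cooldown_models"))  # None — cooldown elapsed

Using the cache TTL as the cooldown clock means no background job is needed to restore deployments — expiry does the cleanup, which is why the PR only had to thread cooldown_time through to the ttl arguments shown above.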
litellm/tests/test_router_fallbacks.py (90 changes: 90 additions & 0 deletions)
@@ -796,3 +796,93 @@ def get_anthropic_params(model: str):

     except Exception as e:
         pytest.fail(f"An exception occurred {e}")
+
+
+def test_custom_cooldown_times():
+    try:
+        # set a custom cooldown time; the failed deployment should appear in
+        # cooldown_models, then drop out once the custom cooldown has elapsed
+
+        model_list = [
+            {  # list of model deployments
+                "model_name": "gpt-3.5-turbo",  # openai model name
+                "litellm_params": {  # params for litellm completion/embedding call
+                    "model": "azure/chatgpt-v-2",
+                    "api_key": "bad-key",
+                    "api_version": os.getenv("AZURE_API_VERSION"),
+                    "api_base": os.getenv("AZURE_API_BASE"),
+                },
+                "tpm": 24000000,
+            },
+            {  # list of model deployments
+                "model_name": "gpt-3.5-turbo",  # openai model name
+                "litellm_params": {  # params for litellm completion/embedding call
+                    "model": "azure/chatgpt-v-2",
+                    "api_key": os.getenv("AZURE_API_KEY"),
+                    "api_version": os.getenv("AZURE_API_VERSION"),
+                    "api_base": os.getenv("AZURE_API_BASE"),
+                },
+                "tpm": 1,
+            },
+        ]
+
+        litellm.set_verbose = False
+
+        router = Router(
+            model_list=model_list,
+            set_verbose=True,
+            debug_level="INFO",
+            cooldown_time=0.1,
+            redis_host=os.getenv("REDIS_HOST"),
+            redis_password=os.getenv("REDIS_PASSWORD"),
+            redis_port=int(os.getenv("REDIS_PORT")),
+        )
+
+        # make a request - expect it to fail
+        try:
+            response = router.completion(
+                model="gpt-3.5-turbo",
+                messages=[
+                    {
+                        "content": "Tell me a joke.",
+                        "role": "user",
+                    }
+                ],
+            )
+        except:
+            pass
+
+        # expect 1 model to be in cooldown models
+        cooldown_deployments = router._get_cooldown_deployments()
+        print("cooldown_deployments after failed call: ", cooldown_deployments)
+        assert (
+            len(cooldown_deployments) == 1
+        ), "Expected 1 model to be in cooldown models"
+
+        selected_cooldown_model = cooldown_deployments[0]
+
+        # wait for 1/2 of the cooldown time
+        time.sleep(router.cooldown_time / 2)
+
+        # expect the cooldown model to still be in cooldown models
+        cooldown_deployments = router._get_cooldown_deployments()
+        print(
+            "cooldown_deployments after waiting 1/2 of cooldown: ", cooldown_deployments
+        )
+        assert (
+            len(cooldown_deployments) == 1
+        ), "Expected 1 model to be in cooldown models"
+
+        # wait for 1/2 of the cooldown time again; now we've waited the full cooldown
+        time.sleep(router.cooldown_time / 2)
+
+        # expect the cooldown model to be removed from cooldown models
+        cooldown_deployments = router._get_cooldown_deployments()
+        print(
+            "cooldown_deployments after waiting cooldown time: ", cooldown_deployments
+        )
+        assert (
+            len(cooldown_deployments) == 0
+        ), "Expected 0 models to be in cooldown models"
+
+    except Exception as e:
+        pytest.fail(f"An exception occurred {e}")
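To run just this test locally (assuming the Azure and Redis environment variables it reads are set), an invocation along these lines should work:

pytest litellm/tests/test_router_fallbacks.py -k test_custom_cooldown_times -s

Note that cooldown_time=0.1 keeps the two sleeps at 0.05s each, so the timing assertions complete almost immediately.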
