[Feat] Litellm.Router set custom cooldown times #1534

Merged · 3 commits · Jan 20, 2024
5 changes: 3 additions & 2 deletions docs/my-website/docs/routing.md
@@ -603,10 +603,11 @@ def __init__(
     timeout: Optional[float] = None,
     default_litellm_params={},  # default params for Router.chat.completion.create
     fallbacks: List = [],
-    allowed_fails: Optional[int] = None,
+    allowed_fails: Optional[int] = None,  # Number of times a deployment can fail before being added to cooldown
+    cooldown_time: float = 1,  # (seconds) time to cooldown a deployment after failure
     context_window_fallbacks: List = [],
     model_group_alias: Optional[dict] = {},
-    retry_after: int = 0,  # min time to wait before retrying a failed request
+    retry_after: int = 0,  # minimum time to wait before retrying a failed request
     routing_strategy: Literal[
         "simple-shuffle",
         "least-busy",
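Taken together, the two documented knobs let callers tune both the failure threshold and how long a failing deployment stays benched. A minimal sketch of a Router configured this way (the deployment entry, key, and endpoint below are placeholders, not values from this PR):

```python
from litellm import Router

# Placeholder deployment list; substitute a real model, key, and endpoint.
model_list = [
    {
        "model_name": "gpt-3.5-turbo",
        "litellm_params": {
            "model": "azure/chatgpt-v-2",
            "api_key": "my-azure-key",
            "api_base": "https://my-endpoint.openai.azure.com",
        },
    }
]

router = Router(
    model_list=model_list,
    allowed_fails=1,   # bench a deployment after a single failure in the window
    cooldown_time=30,  # (seconds) keep the benched deployment out of rotation
)
```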
49 changes: 45 additions & 4 deletions litellm/router.py
@@ -96,10 +96,13 @@ def __init__(
     set_verbose: bool = False,
     debug_level: Literal["DEBUG", "INFO"] = "INFO",
     fallbacks: List = [],
-    allowed_fails: Optional[int] = None,
     context_window_fallbacks: List = [],
     model_group_alias: Optional[dict] = {},
     retry_after: int = 0,  # min time to wait before retrying a failed request
+    allowed_fails: Optional[
+        int
+    ] = None,  # Number of times a deployment can fail before being added to cooldown
+    cooldown_time: float = 1,  # (seconds) time to cooldown a deployment after failure
     routing_strategy: Literal[
         "simple-shuffle",
         "least-busy",
@@ -108,6 +111,36 @@ def __init__(
] = "simple-shuffle",
routing_strategy_args: dict = {}, # just for latency-based routing
) -> None:
"""
Initialize the Router class with the given parameters for caching, reliability, and routing strategy.

Args:
model_list (Optional[list]): List of models to be used. Defaults to None.
redis_url (Optional[str]): URL of the Redis server. Defaults to None.
redis_host (Optional[str]): Hostname of the Redis server. Defaults to None.
redis_port (Optional[int]): Port of the Redis server. Defaults to None.
redis_password (Optional[str]): Password of the Redis server. Defaults to None.
cache_responses (Optional[bool]): Flag to enable caching of responses. Defaults to False.
cache_kwargs (dict): Additional kwargs to pass to RedisCache. Defaults to {}.
caching_groups (Optional[List[tuple]]): List of model groups for caching across model groups. Defaults to None.
client_ttl (int): Time-to-live for cached clients in seconds. Defaults to 3600.
num_retries (int): Number of retries for failed requests. Defaults to 0.
timeout (Optional[float]): Timeout for requests. Defaults to None.
default_litellm_params (dict): Default parameters for Router.chat.completion.create. Defaults to {}.
set_verbose (bool): Flag to set verbose mode. Defaults to False.
debug_level (Literal["DEBUG", "INFO"]): Debug level for logging. Defaults to "INFO".
fallbacks (List): List of fallback options. Defaults to [].
context_window_fallbacks (List): List of context window fallback options. Defaults to [].
model_group_alias (Optional[dict]): Alias for model groups. Defaults to {}.
retry_after (int): Minimum time to wait before retrying a failed request. Defaults to 0.
allowed_fails (Optional[int]): Number of allowed fails before adding to cooldown. Defaults to None.
cooldown_time (float): Time to cooldown a deployment after failure in seconds. Defaults to 1.
routing_strategy (Literal["simple-shuffle", "least-busy", "usage-based-routing", "latency-based-routing"]): Routing strategy. Defaults to "simple-shuffle".
routing_strategy_args (dict): Additional args for latency-based routing. Defaults to {}.

Returns:
Router: An instance of the litellm.Router class.
"""
self.set_verbose = set_verbose
if self.set_verbose:
if debug_level == "INFO":
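The docstring surfaces the pre-existing caching and retry knobs alongside the new cooldown ones. For instance, a Redis-backed router with retries and a custom cooldown might look like this (illustrative values only, reusing the `model_list` sketched earlier; host, port, and password are placeholders):

```python
router = Router(
    model_list=model_list,
    redis_host="localhost",
    redis_port=6379,
    redis_password="my-redis-password",
    cache_responses=True,
    num_retries=2,
    allowed_fails=3,
    cooldown_time=30,
)
```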
@@ -163,6 +196,7 @@ def __init__(
         self.deployment_latency_map[m["litellm_params"]["model"]] = 0

     self.allowed_fails = allowed_fails or litellm.allowed_fails
+    self.cooldown_time = cooldown_time or 1
     self.failed_calls = (
         InMemoryCache()
     )  # cache to track failed call per deployment, if num failed calls within 1 minute > allowed fails, then add it to cooldown
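One subtlety in the `cooldown_time or 1` default: `or` treats `0` as falsy, so an explicit zero cooldown is coerced to one second. A short illustration (assuming the `model_list` and imports from the earlier sketch):

```python
# 0 is falsy, so the `or 1` fallback applies; use a small positive float
# (as the test below does with 0.1) for a near-zero cooldown.
router = Router(model_list=model_list, cooldown_time=0)
assert router.cooldown_time == 1
```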
@@ -1247,6 +1281,7 @@ def _set_cooldown_deployments(self, deployment: Optional[str] = None):
     verbose_router_logger.debug(
         f"Attempting to add {deployment} to cooldown list. updated_fails: {updated_fails}; self.allowed_fails: {self.allowed_fails}"
     )
+    cooldown_time = self.cooldown_time or 1
     if updated_fails > self.allowed_fails:
         # get the current cooldown list for that minute
         cooldown_key = f"{current_minute}:cooldown_models"  # group cooldown models by minute to reduce number of redis calls
@@ -1260,13 +1295,19 @@ def _set_cooldown_deployments(self, deployment: Optional[str] = None):
             else:
                 cached_value = cached_value + [deployment]
                 # save updated value
-                self.cache.set_cache(value=cached_value, key=cooldown_key, ttl=1)
+                self.cache.set_cache(
+                    value=cached_value, key=cooldown_key, ttl=cooldown_time
+                )
         except:
             cached_value = [deployment]
             # save updated value
-            self.cache.set_cache(value=cached_value, key=cooldown_key, ttl=1)
+            self.cache.set_cache(
+                value=cached_value, key=cooldown_key, ttl=cooldown_time
+            )
     else:
-        self.failed_calls.set_cache(key=deployment, value=updated_fails, ttl=1)
+        self.failed_calls.set_cache(
+            key=deployment, value=updated_fails, ttl=cooldown_time
+        )

 def _get_cooldown_deployments(self):
     """
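The net effect of this change: cooled-down deployments are grouped under a per-minute cache key, and the configured `cooldown_time` now drives the cache TTL, so expiry happens in the cache itself rather than in router logic. A minimal standalone sketch of that TTL pattern (the `TinyTTLCache` below is a toy stand-in, not litellm's actual `InMemoryCache`):

```python
import time


class TinyTTLCache:
    """Toy TTL cache standing in for litellm's InMemoryCache (illustrative only)."""

    def __init__(self):
        self._store = {}  # key -> (value, expiry timestamp)

    def set_cache(self, key, value, ttl):
        self._store[key] = (value, time.time() + ttl)

    def get_cache(self, key):
        value, expires_at = self._store.get(key, (None, 0.0))
        return value if time.time() < expires_at else None


cache = TinyTTLCache()
cooldown_time = 0.1  # seconds, matching the test below


def set_cooldown(deployment: str) -> None:
    # Group cooldown models by minute, mirroring _set_cooldown_deployments.
    cooldown_key = f"{time.strftime('%H-%M')}:cooldown_models"
    cached_value = cache.get_cache(key=cooldown_key) or []
    if deployment not in cached_value:
        cache.set_cache(
            key=cooldown_key, value=cached_value + [deployment], ttl=cooldown_time
        )


def get_cooldown_deployments() -> list:
    return cache.get_cache(key=f"{time.strftime('%H-%M')}:cooldown_models") or []


set_cooldown("azure/chatgpt-v-2")
assert get_cooldown_deployments() == ["azure/chatgpt-v-2"]
time.sleep(cooldown_time)
assert get_cooldown_deployments() == []  # the entry aged out after the TTL
```

Because entries are keyed by the minute in which the failure occurred (as in the real code's comment about reducing Redis calls), a lookup that crosses a minute boundary will not see entries set in the previous minute; the TTL handles expiry within the window.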
90 changes: 90 additions & 0 deletions litellm/tests/test_router_fallbacks.py
@@ -796,3 +796,93 @@ def get_anthropic_params(model: str):

     except Exception as e:
         pytest.fail(f"An exception occurred {e}")
+
+
+def test_custom_cooldown_times():
+    try:
+        # Set a custom cooldown: the failed deployment should appear in
+        # cooldown_models, and drop out again once cooldown_time has elapsed.
+
+        model_list = [
+            {  # list of model deployments
+                "model_name": "gpt-3.5-turbo",  # openai model name
+                "litellm_params": {  # params for litellm completion/embedding call
+                    "model": "azure/chatgpt-v-2",
+                    "api_key": "bad-key",
+                    "api_version": os.getenv("AZURE_API_VERSION"),
+                    "api_base": os.getenv("AZURE_API_BASE"),
+                },
+                "tpm": 24000000,
+            },
+            {  # list of model deployments
+                "model_name": "gpt-3.5-turbo",  # openai model name
+                "litellm_params": {  # params for litellm completion/embedding call
+                    "model": "azure/chatgpt-v-2",
+                    "api_key": os.getenv("AZURE_API_KEY"),
+                    "api_version": os.getenv("AZURE_API_VERSION"),
+                    "api_base": os.getenv("AZURE_API_BASE"),
+                },
+                "tpm": 1,
+            },
+        ]
+
+        litellm.set_verbose = False
+
+        router = Router(
+            model_list=model_list,
+            set_verbose=True,
+            debug_level="INFO",
+            cooldown_time=0.1,
+            redis_host=os.getenv("REDIS_HOST"),
+            redis_password=os.getenv("REDIS_PASSWORD"),
+            redis_port=int(os.getenv("REDIS_PORT")),
+        )
+
+        # make a request - expect it to fail
+        try:
+            response = router.completion(
+                model="gpt-3.5-turbo",
+                messages=[
+                    {
+                        "content": "Tell me a joke.",
+                        "role": "user",
+                    }
+                ],
+            )
+        except:
+            pass
+
+        # expect 1 model to be in cooldown models
+        cooldown_deployments = router._get_cooldown_deployments()
+        print("cooldown_deployments after failed call: ", cooldown_deployments)
+        assert (
+            len(cooldown_deployments) == 1
+        ), "Expected 1 model to be in cooldown models"
+
+        selected_cooldown_model = cooldown_deployments[0]
+
+        # wait for 1/2 of the cooldown time
+        time.sleep(router.cooldown_time / 2)
+
+        # expect the model to still be in cooldown models
+        cooldown_deployments = router._get_cooldown_deployments()
+        print(
+            "cooldown_deployments after waiting 1/2 of cooldown: ", cooldown_deployments
+        )
+        assert (
+            len(cooldown_deployments) == 1
+        ), "Expected 1 model to be in cooldown models"
+
+        # wait for the other 1/2; the full cooldown time has now elapsed
+        time.sleep(router.cooldown_time / 2)
+
+        # expect the model to have been removed from cooldown models
+        cooldown_deployments = router._get_cooldown_deployments()
+        print(
+            "cooldown_deployments after waiting cooldown time: ", cooldown_deployments
+        )
+        assert (
+            len(cooldown_deployments) == 0
+        ), "Expected 0 models to be in cooldown models"
+
+    except Exception as e:
+        pytest.fail(f"An exception occurred {e}")
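The test assumes live Azure and Redis credentials in the environment (AZURE_API_KEY, AZURE_API_BASE, AZURE_API_VERSION, REDIS_HOST, REDIS_PORT, REDIS_PASSWORD). A module-level guard like the following (an illustrative addition, not part of this PR) would skip rather than error when they are absent:

```python
import os

import pytest

# Hypothetical guard: skip the whole module when required env vars are missing.
_required_env = [
    "AZURE_API_KEY",
    "AZURE_API_BASE",
    "AZURE_API_VERSION",
    "REDIS_HOST",
    "REDIS_PORT",
    "REDIS_PASSWORD",
]
_missing = [v for v in _required_env if not os.getenv(v)]
if _missing:
    pytest.skip(f"missing required env vars: {_missing}", allow_module_level=True)
```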