Merge pull request ggml-org#64 from jm12138/add_unlimited_max_tokens
Add unlimited max_tokens
abetlen authored Jul 8, 2023
2 parents 236c4cf + 5d756de commit 2472420
Showing 1 changed file with 12 additions and 4 deletions.
llama_cpp/llama.py (16 changes: 12 additions & 4 deletions)
```diff
@@ -824,7 +824,15 @@ def _create_completion(
         if self.verbose:
             llama_cpp.llama_reset_timings(self.ctx)
 
-        if len(prompt_tokens) > self._n_ctx:
+        if max_tokens <= 0:
+            # Unlimited, depending on n_ctx.
+            if len(prompt_tokens) >= int(llama_cpp.llama_n_ctx(self.ctx)):
+                raise ValueError(
+                    f"Requested tokens exceed context window of {llama_cpp.llama_n_ctx(self.ctx)}"
+                )
+            else:
+                max_tokens = int(llama_cpp.llama_n_ctx(self.ctx)) - len(prompt_tokens)
+        elif len(prompt_tokens) + max_tokens > int(llama_cpp.llama_n_ctx(self.ctx)):
             raise ValueError(
                 f"Requested tokens ({len(prompt_tokens)}) exceed context window of {self._n_ctx}"
             )
```
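In plain terms, the new branch treats any non-positive max_tokens as "use whatever room is left in the context window". A minimal standalone sketch of that clamping logic, for illustration only (resolve_max_tokens, prompt_len, and n_ctx are hypothetical names, not part of the library):

```python
def resolve_max_tokens(max_tokens: int, prompt_len: int, n_ctx: int) -> int:
    """Mirror of the clamping logic in the diff above, for illustration only."""
    if max_tokens <= 0:
        # "Unlimited": generate into whatever context space the prompt leaves free.
        if prompt_len >= n_ctx:
            raise ValueError(f"Requested tokens exceed context window of {n_ctx}")
        return n_ctx - prompt_len
    # Explicit budget: prompt plus completion must still fit inside the context.
    if prompt_len + max_tokens > n_ctx:
        raise ValueError(f"Requested tokens ({prompt_len}) exceed context window of {n_ctx}")
    return max_tokens


# Example: a 512-token context with a 100-token prompt leaves 412 tokens to generate.
assert resolve_max_tokens(0, prompt_len=100, n_ctx=512) == 412
assert resolve_max_tokens(50, prompt_len=100, n_ctx=512) == 50
```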
```diff
@@ -1231,7 +1239,7 @@ def create_completion(
         Args:
             prompt: The prompt to generate text from.
             suffix: A suffix to append to the generated text. If None, no suffix is appended.
-            max_tokens: The maximum number of tokens to generate.
+            max_tokens: The maximum number of tokens to generate. If max_tokens <= 0, the maximum number of tokens to generate is unlimited and depends on n_ctx.
             temperature: The temperature to use for sampling.
             top_p: The top-p value to use for sampling.
             logprobs: The number of logprobs to return. If None, no logprobs are returned.
```
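A hedged usage sketch of the new behavior through create_completion; the model path is a placeholder and n_ctx=2048 is just an assumed context size:

```python
from llama_cpp import Llama

# Placeholder model path; any GGML model file supported by this version works here.
llm = Llama(model_path="./models/7B/ggml-model.bin", n_ctx=2048)

# With this change, max_tokens=0 (or any value <= 0) means "generate until the
# 2048-token context window is full" rather than a fixed completion length.
out = llm.create_completion(
    prompt="Q: Name the planets in the solar system. A:",
    max_tokens=0,
    stop=["Q:"],
)
print(out["choices"][0]["text"])
```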
```diff
@@ -1304,7 +1312,7 @@ def __call__(
         Args:
             prompt: The prompt to generate text from.
             suffix: A suffix to append to the generated text. If None, no suffix is appended.
-            max_tokens: The maximum number of tokens to generate.
+            max_tokens: The maximum number of tokens to generate. If max_tokens <= 0, the maximum number of tokens to generate is unlimited and depends on n_ctx.
             temperature: The temperature to use for sampling.
             top_p: The top-p value to use for sampling.
             logprobs: The number of logprobs to return. If None, no logprobs are returned.
```
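The callable shorthand forwards to create_completion, so a negative max_tokens behaves the same way; a minimal sketch reusing the llm object from the previous example:

```python
# __call__ delegates to create_completion, so max_tokens=-1 is also treated as unlimited.
out = llm("Q: What is the capital of France? A:", max_tokens=-1, stop=["Q:"])
print(out["choices"][0]["text"])
```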
```diff
@@ -1432,7 +1440,7 @@ def create_chat_completion(
             top_k: The top-k value to use for sampling.
             stream: Whether to stream the results.
             stop: A list of strings to stop generation when encountered.
-            max_tokens: The maximum number of tokens to generate.
+            max_tokens: The maximum number of tokens to generate. If max_tokens <= 0, the maximum number of tokens to generate is unlimited and depends on n_ctx.
             repeat_penalty: The penalty to apply to repeated tokens.
 
         Returns:
```
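The chat endpoint documents the same semantics; a hedged sketch, assuming the llm object from above and the library's OpenAI-style message format:

```python
# max_tokens <= 0 lets the assistant reply grow until the context window is
# exhausted or a stop condition is reached.
chat = llm.create_chat_completion(
    messages=[
        {"role": "system", "content": "You are a concise assistant."},
        {"role": "user", "content": "Summarize what the n_ctx parameter controls."},
    ],
    max_tokens=-1,
)
print(chat["choices"][0]["message"]["content"])
```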
