From 841019794a15027b13570eb3fd9ae7a133267f24 Mon Sep 17 00:00:00 2001
From: Jon Perry
Date: Wed, 30 Aug 2023 16:48:31 +0000
Subject: [PATCH 1/2] Fix bug with stablelm CompleteStream

Minor cleanup of stablelm and ctransformers code
---
 models/llms/ctransformers/requirements.txt |  3 +-
 models/llms/ctransformers/test_stream.py   |  2 +-
 models/llms/stablelm/stablelm.py           | 43 ++++++++++++----------
 models/llms/stablelm/test_stream.py        |  2 +-
 4 files changed, 27 insertions(+), 23 deletions(-)

diff --git a/models/llms/ctransformers/requirements.txt b/models/llms/ctransformers/requirements.txt
index 4803a9a0e..4472a7c07 100644
--- a/models/llms/ctransformers/requirements.txt
+++ b/models/llms/ctransformers/requirements.txt
@@ -1,2 +1,3 @@
 transformers
-torch
\ No newline at end of file
+torch
+ctransformers
\ No newline at end of file
diff --git a/models/llms/ctransformers/test_stream.py b/models/llms/ctransformers/test_stream.py
index 6c2353d2a..8ff5ce2ea 100644
--- a/models/llms/ctransformers/test_stream.py
+++ b/models/llms/ctransformers/test_stream.py
@@ -35,7 +35,7 @@ def run():
     response: Iterator[leapfrogai.CompletionResponse] = stub.CompleteStream(request)
 
     for completion in response:
-        print(completion.choices[0].text, end="")
+        print(completion.choices[0].text, end="", flush=True)
 
 
 if __name__ == "__main__":
diff --git a/models/llms/stablelm/stablelm.py b/models/llms/stablelm/stablelm.py
index e0ab431da..935062a79 100644
--- a/models/llms/stablelm/stablelm.py
+++ b/models/llms/stablelm/stablelm.py
@@ -17,6 +17,7 @@
     CompletionRequest,
     CompletionResponse,
     CompletionServiceServicer,
+    CompletionStreamServiceServicer,
     GrpcContext,
     serve,
 )
@@ -32,49 +33,51 @@ def __call__(
                 return True
         return False
 
-
-class StableLM(CompletionServiceServicer):
-    torch.cuda.init()
-    if torch.cuda.is_available():
-        device = "cuda"
-    else:
-        device = "cpu"
-    tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_ID)
-    model = AutoModelForCausalLM.from_pretrained(MODEL_ID)
-    model.half().cuda()
-    print("StableLM Loaded...")
+class StableLM(CompletionServiceServicer, CompletionStreamServiceServicer):
+    def __init__(self):
+        torch.cuda.init()
+        if torch.cuda.is_available():
+            self.device = "cuda"
+        else:
+            self.device = "cpu"
+        self.tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_ID)
+        self.model = AutoModelForCausalLM.from_pretrained(MODEL_ID)
+        self.model.half().cuda()
+        print("StableLM Loaded...")
 
     def Complete(
         self, request: CompletionRequest, context: GrpcContext
    ) -> CompletionResponse:
-        print(f"Request: { request }")
-        inputs = self.tokenizer(request.prompt, return_tensors="pt").to(torch.cuda.current_device())
+        logging.debug(f"Request: { request }")
+        inputs = self.tokenizer(request.prompt, return_tensors="pt").to(self.device)
+
+        # error checking for valid params
         tokens = self.model.generate(
             **inputs,
             max_new_tokens=request.max_new_tokens,
             temperature=request.temperature,
-            # repetition_penalty=request.frequence_penalty,
-            # top_p=request.top_p,
             do_sample=True,
             pad_token_id=self.tokenizer.eos_token_id,
             eos_token_id=self.tokenizer.eos_token_id,
             stopping_criteria=StoppingCriteriaList([StopOnTokens()]),
         )
         logging.debug(f"Response {tokens}")
+
+        # Extract out only the completion tokens
         completion_tokens = tokens[0][inputs["input_ids"].size(1) :]
         completion = self.tokenizer.decode(completion_tokens, skip_special_tokens=True)
-
         c = CompletionChoice(text=completion, index=0)
         logging.debug(f"Decoded Response: {completion}")
+
         return CompletionResponse(choices=[c])
 
     def CompleteStream(self, request: CompletionRequest, context: GrpcContext):
-        inputs = self.tokenizer(request.prompt, return_tensors="pt").to(self.device)
         logging.debug(f"Request: { request }")
+        inputs = self.tokenizer(request.prompt, return_tensors="pt").to(self.device)
         streamer = TextIteratorStreamer(self.tokenizer, skip_prompt=True)
+
         generation_kwargs = dict(
             inputs,
             streamer=streamer,
@@ -86,12 +89,12 @@ def CompleteStream(self, request: CompletionRequest, context: GrpcContext):
             eos_token_id=self.tokenizer.eos_token_id,
             stopping_criteria=StoppingCriteriaList([StopOnTokens()]),
         )
+
         thread = Thread(target=self.model.generate, kwargs=generation_kwargs)
         thread.start()
         for text in streamer:
-            print(text)
-            yield text
-            # logging.debug(f"Response {tokens}")
+            completion = CompletionChoice(text=text, index=0)
+            yield CompletionResponse(choices=[completion])
 
 
 if __name__ == "__main__":
diff --git a/models/llms/stablelm/test_stream.py b/models/llms/stablelm/test_stream.py
index bc8116d42..3010f16f1 100644
--- a/models/llms/stablelm/test_stream.py
+++ b/models/llms/stablelm/test_stream.py
@@ -37,7 +37,7 @@ def run():
     response: Iterator[leapfrogai.CompletionResponse] = stub.CompleteStream(request)
 
     for completion in response:
-        print(completion.choices[0].text, end="")
+        print(completion.choices[0].text, end="", flush=True)
 
 
 if __name__ == "__main__":

From 5498cb7699877e3d1ddd203df2cbddcb1aa6b15e Mon Sep 17 00:00:00 2001
From: Jon Perry
Date: Wed, 30 Aug 2023 17:00:20 +0000
Subject: [PATCH 2/2] fix system prompt for ctransformers tests

---
 models/llms/ctransformers/test.py        | 2 +-
 models/llms/ctransformers/test_stream.py | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/models/llms/ctransformers/test.py b/models/llms/ctransformers/test.py
index d281515d3..699446389 100644
--- a/models/llms/ctransformers/test.py
+++ b/models/llms/ctransformers/test.py
@@ -14,7 +14,7 @@
 You are an AI assistant that answers participates in chat discussions in an honest, concise, friendly way.<|im_end|>
 <|im_start|>user
 Write two sequences composed of 3 'A's and 2 'B's such that there are no two successive identical letter. Be concise.<|im_end|>
-<|im_assistant|>
+<|im_start|>assistant
 """
 
 def run():
diff --git a/models/llms/ctransformers/test_stream.py b/models/llms/ctransformers/test_stream.py
index 8ff5ce2ea..84e822ec3 100644
--- a/models/llms/ctransformers/test_stream.py
+++ b/models/llms/ctransformers/test_stream.py
@@ -14,7 +14,7 @@
 You are an AI assistant that answers participates in chat discussions in an honest, concise, friendly way.<|im_end|>
 <|im_start|>user
 Write two sequences composed of 3 'A's and 2 'B's such that there are no two successive identical letter. Be concise.<|im_end|>
-<|im_assistant|>
+<|im_start|>assistant
 """
 
 def run():
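
Background on the streaming pattern used by the CompleteStream fix in PATCH 1/2: model.generate() blocks until decoding finishes, so the servicer runs it on a worker thread and drains decoded text chunks from a transformers TextIteratorStreamer; the fix replaces yielding those raw string chunks with yielding CompletionResponse messages, which is what the gRPC stream handler is expected to produce. The snippet below is a minimal standalone sketch of that thread-plus-streamer pattern, not the servicer itself: the "gpt2" checkpoint and the prompt are placeholders, and the leapfrogai response wrapping is only referenced in a comment so the sketch runs without the gRPC stubs.

from threading import Thread

from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer

# Placeholder checkpoint; the servicer in the patch loads its own MODEL_ID / TOKENIZER_ID.
tokenizer = AutoTokenizer.from_pretrained("gpt2")
model = AutoModelForCausalLM.from_pretrained("gpt2")

inputs = tokenizer("The quick brown fox", return_tensors="pt")

# skip_prompt=True keeps the echoed prompt out of the streamed chunks,
# matching the stablelm servicer.
streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)

generation_kwargs = dict(inputs, streamer=streamer, max_new_tokens=32, do_sample=True)

# generate() blocks, so it runs on a worker thread while this thread
# iterates the streamer and receives decoded text as it is produced.
thread = Thread(target=model.generate, kwargs=generation_kwargs)
thread.start()

for text in streamer:
    # In stablelm.py each chunk is wrapped as
    # CompletionResponse(choices=[CompletionChoice(text=text, index=0)])
    # before being yielded to the gRPC stream; printing stands in for that here.
    print(text, end="", flush=True)

thread.join()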