feat: add experimental refactorings. #683

Closed
20 changes: 20 additions & 0 deletions LLama/Experimental/Abstractions/IModelRunner.cs
@@ -0,0 +1,20 @@
using LLama.Experimental.Common;
using System;
using System.Collections.Generic;
using System.Text;

namespace LLama.Experimental.Abstractions
{
/// <summary>
/// Defines how to execute the model.
/// </summary>
public interface IModelRunner : IDisposable
{
/// <summary>
/// Process the scheduled sequences and produce the sampled output.
/// </summary>
/// <param name="seqGroupMetadataList"></param>
/// <returns></returns>
SamplerOutput ExecuteModel(IEnumerable<SequenceGroupMetadata> seqGroupMetadataList);
}
}
44 changes: 44 additions & 0 deletions LLama/Experimental/Abstractions/ISamplingMethod.cs
@@ -0,0 +1,44 @@
using LLama.Experimental.Common;
using LLama.Experimental.Runner.LLamaCpp;
using System;
using System.Collections.Generic;
using System.Text;

namespace LLama.Experimental.Abstractions
{
/// <summary>
/// Method to sample the model output.
/// </summary>
public interface ISamplingMethod

Member:
This looks similar to ISamplingPipeline, is it basically that?

Collaborator Author:
Yes, I renamed it because I wasn't sure whether it would end up slightly different from ISamplingPipeline. Using a different name avoids introducing modifications in the non-experimental part.

Contributor:
I like the flexibility of SamplingMetaData replacing current ISamplingPipeline's lastTokens! A lot of info could be passed that way. A slightly more flexible design could be this:

public interface ISampler<T> { T Sample(in Span<float> logits, SamplingMetadata metadata = null); }
public class BaseTokenSampler : ISampler<LLamaToken> { ... }
public interface ISamplerOption {
    bool ShouldApply(in Span<float> logits, SamplingMetadata data) => true;
    void Apply(ref Span<float> logits, SamplingMetadata data);
}

And for the highest level something like samplingParams.CreateSampler() could unify these.
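
As a rough sketch of how those pieces might compose (this builds on the proposed interfaces above; the option type and the composition loop are invented for illustration and are not part of this PR):

// Illustration only: one concrete ISamplerOption under the proposal above.
public sealed class TemperatureOption : ISamplerOption
{
    private readonly float _temperature;
    public TemperatureOption(float temperature) => _temperature = temperature;

    // Skip the stage when it would be a no-op.
    public bool ShouldApply(in Span<float> logits, SamplingMetadata data) => _temperature != 1f;

    // Scale the logits in place before the final sampling step.
    public void Apply(ref Span<float> logits, SamplingMetadata data)
    {
        for (int i = 0; i < logits.Length; i++)
            logits[i] /= _temperature;
    }
}

// A sampler produced by samplingParams.CreateSampler() could then just walk an ordered list of options:
//     foreach (var option in options)
//         if (option.ShouldApply(logits, metadata))
//             option.Apply(ref logits, metadata);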

Collaborator Author:
Thanks a lot for your suggestions! I like the ILLamaSymbol approach you mentioned. Actually, Martin has made some similar changes before (for example LLamaToken and LLamaSeqId). I skipped these abstractions to rush this prototype out and will add them later.

Add abstractions for Modals

This looks a bit confusing to me; could you please tell me more about your idea? Who is supposed to call NeedsEvaluation, and when?

IReadOnlyList batches; // Name 'Batch' would make more sense instead of 'Sequence' imo

From my point of view they are two entirely different things. A Sequence is a collection of semantically continuous information whose length grows during inference, while a Batch is the data fed into the model in one inference step. A Batch might contain data from multiple sequences, as you can see in llama.cpp's llama_batch.
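
To illustrate the distinction with made-up numbers (simplified tuples, not the PR's actual types):

// Two sequences, each growing as generation proceeds.
int[] seqA = { 101, 7, 42, 9 };
int[] seqB = { 101, 13 };

// One decode step feeds the latest token of each sequence, tagged with its
// sequence id and position -- roughly what llama.cpp's llama_batch holds.
var batch = new (int Token, int SeqId, int Pos)[]
{
    (seqA[seqA.Length - 1], 0, seqA.Length - 1),
    (seqB[seqB.Length - 1], 1, seqB.Length - 1),
};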

Contributor:
I’m glad you like it! I actually made a POC for the modals with some draft code; I’ll share it tomorrow, but one idea is to call them from the high-level generator, which has access to model.modals, to digest the prompt.
Modals don’t need to have state and would mostly be logic containers (e.g. ImageModal would process pending image links in the LLamaBatch, and TextModal would finally process the tokens).

Regarding Batch and Sequence: from an inference perspective it just makes more sense to me for a ‘Batch’ to be a batch for the user to process/consume, rather than a batch for the model’s internals, but I get that it’s an established term for that right now :p Anyway, whenever I wrote ‘batch’ above I meant what’s established as ‘sequence’!

Collaborator Author (@AsakusaRinne, Apr 23, 2024):
It's certainly okay to expose the batch to users, but I prefer to expose it in low-level APIs (ModelRunner in this draft) because the batch is directly related to llama.cpp.

Member (@martindevans, Apr 23, 2024):
SamplingMetaData

I like the idea of SamplingMetaData containing more info than just the raw lastTokens that ISamplingPipeline has at the moment (although I'd probably try to come up with another name). It would be a nice, simple PR if someone wants to adjust the ISamplingPipeline interface to accept something like that (even if it only wraps up lastTokens for now, it's a good place to add more things in the future).
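
A minimal sketch of what that wrapper could look like (the names are hypothetical, not the existing ISamplingPipeline API):

// Starts as a thin wrapper around lastTokens; more context (sequence id,
// per-request parameters, grammar state, ...) can be added later without
// changing the pipeline signature again.
public sealed class SamplingContext
{
    public SamplingContext(IReadOnlyList<int> lastTokens) => LastTokens = lastTokens;

    public IReadOnlyList<int> LastTokens { get; }
}

// The pipeline method would then accept the wrapper instead of the raw token list:
//     LLamaToken Sample(Span<float> logits, SamplingContext context);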

samplingParams.CreateSampler()

I don't like this idea though. One of the motivations of creating ISamplingPipeline was that we have/had configurable sampling and it's a mess - see ISamplingParams. Many of those properties don't mean anything depending on the value of other properties. Adding new sampling stages needs even more properties. Stages are not re-orderable so even with all this it's not powerful enough. New/custom sampling stages cannot be added at all.

In general I don't think designs that try to have a single all-powerful config object are very good. I think it's almost always simpler and more powerful to expose primitives that can be chained together in a nice simple chain of calls like this.
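
For instance, sampling primitives chained as plain calls might look roughly like this (the stage names and signatures are illustrative, not the actual LLamaSharp sampler API):

// Illustration only: each stage is a small primitive, applied in whatever order
// the caller wants, and a custom stage is just another function in the chain.
static void ApplyRepetitionPenalty(Span<float> logits, IReadOnlyList<int> lastTokens, float penalty)
{
    foreach (var token in lastTokens)
        logits[token] = logits[token] > 0 ? logits[token] / penalty : logits[token] * penalty;
}

static void ApplyTemperature(Span<float> logits, float temperature)
{
    for (int i = 0; i < logits.Length; i++)
        logits[i] /= temperature;
}

static int GreedySample(Span<float> logits)
{
    int best = 0;
    for (int i = 1; i < logits.Length; i++)
        if (logits[i] > logits[best]) best = i;
    return best;
}

// A "pipeline" is then just an explicit, re-orderable chain of calls:
//     ApplyRepetitionPenalty(logits, lastTokens, 1.1f);
//     ApplyTemperature(logits, 0.8f);
//     var token = GreedySample(logits);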

Contributor (@Lyrcaxis, Apr 23, 2024):
@AsakusaRinne It's certainly okay to expose batch to users, but I prefer to expose it in low-level APIs

We agree ^^ I now regret the naming I used lol.. I meant to suggest we wrap it all up inside a Context class for all the other low-level and middle-level classes.

@martindevans I don't like this idea though. [..] - see ISamplingParams. [..] it's not powerful enough.

You're right! ISamplerOption in a dynamic list would fix that in my mind, but I get where you're coming from.
I also had a chain design in mind, but it would shift control to creation time, somewhat like so:
var sampler = p.With(new CustomTemperatureSamplerOption()).With(new BaseTopPSamplerOption(0.95f)).ToSampler();

But I understand how that might be considered boilerplate/bloat and leaving it up to a pipeline might be best here.

Member:
Before I pushed the current sampler API I experimented with a few different APIs. One was encapsulating every sampler operation into an object (with all the relevant parameters), so a pipeline would just be a list of sampler stage objects. I didn't find it very good to work with: in reality sampler stages are not always completely isolated, so it was a very leaky abstraction.

// TODO: We should reconsider this design. Maybe it's better to use `SamplingParams` to let user set,
// and choose the actual sampler internally according to the params.
{
/// <summary>
/// The maximum number of sequences running in parallel.
///
/// If you don't know what to return, you can return the default value.
///
/// Generally, if you want to select several results from n results, you need
/// to set the maximum number of sequences to run.
/// </summary>
/// <param name="defaultValue"></param>
/// <param name="currentNumSeqs"></param>
/// <returns></returns>
int GetMaxNumRunningSeqs(int defaultValue, int currentNumSeqs);

/// <summary>
/// Whether to skip special tokens.
/// </summary>
bool SkipSpecialTokens { get; }

/// <summary>
/// Sample the sequence logits to get the token.
/// </summary>
/// <param name="logits"></param>
/// <param name="seqId"></param>
/// <param name="samplingMetadata"></param>
/// <returns></returns>
SequenceOutput SampleSequence(Span<float> logits, int seqId, SamplingMetadata samplingMetadata);
// TODO: maybe we shouldn't expose all the samplingMetadata to users here.
}
}
21 changes: 21 additions & 0 deletions LLama/Experimental/Abstractions/ISchedulingPolicy.cs
@@ -0,0 +1,21 @@
using LLama.Experimental.Common;
using System;
using System.Collections.Generic;
using System.Text;

namespace LLama.Experimental.Abstractions
{
/// <summary>
/// Defines the scheduling policy, which decides the priority order of sequences.
/// </summary>
public interface ISchedulingPolicy
{
/// <summary>
/// Get the priority of a sequence group.
/// </summary>
/// <param name="now"></param>
/// <param name="seqGroup"></param>
/// <returns></returns>
int GetPriority(DateTime now, SequenceGroup seqGroup);
}
}
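
As an illustration of how this interface might be implemented (a sketch only: it assumes that a larger value means higher priority, and uses the Metrics.ArrivalTime property that SequenceGroup exposes elsewhere in this PR):

// First-come-first-served: requests that arrived earlier get a higher priority.
public sealed class FcfsSchedulingPolicy : ISchedulingPolicy
{
    public int GetPriority(DateTime now, SequenceGroup seqGroup)
    {
        var waited = now - seqGroup.Metrics.ArrivalTime;
        return (int)Math.Min(int.MaxValue, waited.TotalSeconds);
    }
}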
30 changes: 30 additions & 0 deletions LLama/Experimental/Abstractions/IStoppingCriteria.cs
@@ -0,0 +1,30 @@
using LLama.Experimental.Common;
using System;
using System.Collections.Generic;
using System.Text;

namespace LLama.Experimental.Abstractions
{
/// <summary>
/// Stopping criteria that can be applied during generation.
/// </summary>
public interface IStoppingCriteria
{
/// <summary>
/// Check if the sequence should be stopped and return the status.
///
/// If it's not supposed to be stopped, be sure to return its current status.
/// </summary>
/// <param name="seq"></param>
/// <returns></returns>
StoppingCriteriaOutput CheckStop(Sequence seq); // TODO: include other params?
}

/// <summary>
/// The output of <see cref="IStoppingCriteria.CheckStop(Sequence)"/>
/// </summary>
/// <param name="Status">The sequence status.</param>
/// <param name="StoppingString">If the sequence stops because of the appearance of a string, please set it here.</param>
/// <param name="StoppingTokenId">If the sequence stops because of the appearance of a token, please set it here.</param>
public record class StoppingCriteriaOutput(SequenceStatus Status, string? StoppingString = null, int? StoppingTokenId = null);
}
34 changes: 34 additions & 0 deletions LLama/Experimental/Abstractions/ITokenizer.cs
@@ -0,0 +1,34 @@
using System;
using System.Collections.Generic;
using System.Text;

namespace LLama.Experimental.Abstractions
{
/// <summary>
/// The interface for a tokenizer in LLamaSharp. It's responsible for converting text to token ids and vice versa.
/// </summary>
public interface ITokenizer
{
// TODO: `ApplyChatTemplate` API

// TODO: Batched Encode?

Member:
As far as I know there are no batched tokenization APIs in llama.cpp that we could use.

Contributor:
I'd be all in for allowing llama.cpp's built-in tokenizer to be bypassed -- no reason to lock the architecture to that dependency.


/// <summary>
/// Get the token ids from the text
/// </summary>
/// <param name="input"></param>
/// <returns></returns>
IList<int> Tokenize(string input);

/// <summary>
/// Convert the token ids to text.
/// </summary>
/// <param name="tokenIds"></param>
/// <param name="result"></param>
/// <param name="skipSpecialTokens"></param>
/// <returns>The consumed tokens for decoding.</returns>
int ConvertIdsToText(IEnumerable<int> tokenIds, out string result, bool skipSpecialTokens = false);

Member:
We can't have an API like this. It's the same problem as the old DeTokenize method, solved by the StreamingTokenDecoder.

A single character may be several tokens. So, for example, say the tokens [1, 2, 3] together produce the character A: decoding [1, 2] will produce a broken string, and then decoding [3] will produce another broken string.
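
A small self-contained illustration of the problem (the token-to-byte mapping below is made up): a multi-byte character can be split across tokens, so only a stateful decoder that buffers incomplete byte sequences, which is what StreamingTokenDecoder does, can produce valid text incrementally.

using System;
using System.Text;

class PartialDecodeDemo
{
    // Hypothetical mapping: tokens 1 and 2 together encode '€' (UTF-8 bytes E2 82 AC).
    static byte[] Bytes(int token) => token switch
    {
        1 => new byte[] { 0xE2 },
        2 => new byte[] { 0x82, 0xAC },
        _ => Array.Empty<byte>()
    };

    static void Main()
    {
        // A naive per-chunk decode of token 1 alone would yield a broken string,
        // because 0xE2 is an incomplete UTF-8 sequence.
        var decoder = Encoding.UTF8.GetDecoder(); // stateful: buffers partial sequences
        var chars = new char[8];
        int n1 = decoder.GetChars(Bytes(1), 0, 1, chars, 0, flush: false);  // n1 == 0, byte buffered
        int n2 = decoder.GetChars(Bytes(2), 0, 2, chars, n1, flush: false); // n2 == 1, '€' completed
        Console.WriteLine(new string(chars, 0, n1 + n2)); // prints "€"
    }
}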

Collaborator Author:
There is an IncrementalDecodingOffset member on Sequence. I'm trying to find a way to integrate the streaming decoder better into this design but haven't managed it yet. The biggest problem with using it directly is that it rents some memory which needs to be released. Although adding a StreamingTokenDecoder as a member of Sequence would make it auto-released, it's a bit weird for Sequence to own a StreamingTokenDecoder, because that should be part of the tokenizer. Besides, Sequence, as a mid-level class which may be used frequently by users, is not supposed to deal with logic related to LLamaContext, while an LLamaContext is required to initialize a StreamingTokenDecoder.
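
A hedged sketch of how that could eventually be wired up, assuming Sequence exposes OutputTokens, OutputText and a writable IncrementalDecodingOffset (only the offset is mentioned above; the other members are taken from how Sequence is used elsewhere in this PR):

// Hypothetical glue code: the tokenizer reports how many tokens it could safely
// decode into complete text, and the sequence advances its offset by that much.
// (Requires using System.Linq for Skip.)
static void AppendNewText(ITokenizer tokenizer, Sequence seq)
{
    var pending = seq.OutputTokens.Skip(seq.IncrementalDecodingOffset);
    int consumed = tokenizer.ConvertIdsToText(pending, out var text, skipSpecialTokens: true);
    seq.IncrementalDecodingOffset += consumed;
    seq.OutputText += text;
}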


// TODO: decode from Logprobs
}
}
24 changes: 24 additions & 0 deletions LLama/Experimental/Common/ModelRunnerInput.cs
@@ -0,0 +1,24 @@
using System;
using System.Collections.Generic;
using System.Text;

namespace LLama.Experimental.Common
{
/// <summary>
/// The input prepared for model runner.
/// </summary>
/// <param name="TokenIds">The tokens to feed to the model.</param>
/// <param name="Positions">The positions of these tokens.</param>
/// <param name="SeqIds">The sequence ids of these tokens.</param>
/// <param name="WithLogits">Whether logits need to be computed for each token.</param>
/// <param name="PromptLengths">The lengths of the prompts if the input is at prefill stage, otherwise empty.</param>
/// <param name="SubqueryLengths">The lengths of the subqueries if the input is at prefill stage, otherwise empty.</param>
public record class ModelRunnerInput(
int[] TokenIds,
int[] Positions,
int[] SeqIds,
bool[] WithLogits,
int[] PromptLengths,
int[] SubqueryLengths
);
}
42 changes: 42 additions & 0 deletions LLama/Experimental/Common/RequestMetrics.cs
@@ -0,0 +1,42 @@
using System;
using System.Collections.Generic;
using System.Text;

namespace LLama.Experimental.Common
{
/// <summary>
/// Metrics associated with a request.
/// </summary>
public class RequestMetrics
{
/// <summary>
/// The time when the request arrived.
/// </summary>
public DateTime ArrivalTime { get; set; }

/// <summary>
/// The time when the request was first scheduled.
/// </summary>
public DateTime? FirstScheduledTime { get; set; }

/// <summary>
/// The time when the first token was generated.
/// </summary>
public DateTime? FirstTokenTime { get; set; }

/// <summary>
/// The time when the last token was generated.
/// </summary>
public DateTime? LastTokenTime { get; set; }

/// <summary>
/// The time the request spent in the queue.
/// </summary>
public TimeSpan? TimeInQueue { get; set; }

/// <summary>
/// The time when the request was finished.
/// </summary>
public DateTime? FinishedTime { get; set; }
}
}
101 changes: 101 additions & 0 deletions LLama/Experimental/Common/RequestOutput.cs
@@ -0,0 +1,101 @@
using LLama.Experimental.Utils;
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;

namespace LLama.Experimental.Common
{
/// <summary>
/// The output data of a request to the LLM.
/// </summary>
/// <param name="RequestId">The unique ID of the request.</param>
/// <param name="Prompt">The prompt string of the request.</param>
/// <param name="PromptTokenIds">The token IDs of the prompt.</param>
/// <param name="Outputs">The output sequences of the request.</param>
/// <param name="Finished">Whether the whole request is finished.</param>
/// <param name="Metrics">Metrics associated with the request.</param>
public record class RequestOutput(
string RequestId,
string? Prompt,
IList<int> PromptTokenIds,
IList<CompletionOutput> Outputs,
bool Finished,
RequestMetrics Metrics
)
{
/// <summary>
/// Create an instance from <see cref="SequenceGroup"/>.
/// </summary>
/// <param name="seqGroup"></param>
/// <returns></returns>
/// <exception cref="NotImplementedException"></exception>
public static RequestOutput FromSeqGroup(SequenceGroup seqGroup)
{
var seqs = seqGroup.GetAllSeqs();
if(seqs.Count() != 1)
{
// TODO: deal with beam search here.
throw new NotImplementedException();
}

List<CompletionOutput> outputs = new();
int index = 0;
foreach(var seq in seqs)
{
outputs.Add(new CompletionOutput(index, seq.OutputText, seq.OutputTokens,
seq.Status.GetFinishedReason(), seq.StoppingString, seq.StoppingTokenId));
index++;
}

if (seqGroup.IsFinished)
{
seqGroup.SetFinishedTime(DateTime.Now);
}
return new RequestOutput(seqGroup.RequestId, seqGroup.Prompt, seqGroup.PromptTokenIds,
outputs, seqGroup.IsFinished, seqGroup.Metrics);
}

/// <inheritdoc/>
public override string ToString()
{
return ClassStringFormatter.Format(this);
}
}

/// <summary>
/// The output data of one completion output of a request.
/// </summary>
/// <param name="Index">The index of the output in the request.</param>
/// <param name="Text">The generated output text.</param>
/// <param name="TokenIds">The token IDs of the generated output text.</param>
/// <param name="FinishReason">The reason why the sequence is finished.</param>
/// <param name="StoppingString">
/// The stop string that caused the completion to stop,
/// or null if the completion finished for some other reason.
/// </param>
/// <param name="StoppingToken">
/// The stop token that caused the completion to stop,
/// or null if the completion finished for some other reason.
/// </param>
public record class CompletionOutput(
int Index,
string Text,
IList<int> TokenIds,
string FinishReason,
string? StoppingString,
int? StoppingToken
)
{
/// <summary>
/// Whether the completion has finished.
/// </summary>
public bool IsFinished => !string.IsNullOrEmpty(FinishReason);

/// <inheritdoc/>
public override string ToString()
{
return ClassStringFormatter.Format(this);
}
}
}