59 changes: 59 additions & 0 deletions LLama.Examples/Examples/CustomGenerationControl.cs
@@ -0,0 +1,59 @@
using LLama.Abstractions;
using LLama.Common;
using LLama.Control;
using LLama.Examples.Extensions;

namespace LLama.Examples.Examples
{
public class CustomGenerationControl
{
public class NumberGenerationControl: IGenerationControl
{
public bool ShouldStopGeneration(LLamaContext context, IInferenceParams inferenceParams, string lastOutputText)
{
return lastOutputText.Any(x => char.IsDigit(x) && (x == '4' || x == '5'));
}

public bool ShouldStopGeneration(LLamaContext context, IInferenceParams inferenceParams, int lastOutputId)
{
return false;
}
}
public static async Task Run()
{
Console.Write("Please input your model path: ");
var modelPath = Console.ReadLine();

var parameters = new ModelParams(modelPath)
{
ContextSize = 1024,
Seed = 1337,
GpuLayerCount = 5
};
using var model = LLamaWeights.LoadFromFile(parameters);
var ex = new StatelessExecutor(model, parameters);

Console.ForegroundColor = ConsoleColor.Yellow;
Console.WriteLine("This is an example to show how to customize the generation control of the executors. Here we implement a control mode in which" +
" the generation will stop once there's a number 4 or 5 is generated. Please try different questions to lead the model to generate answers with and without numbers." +
" No anti-prompt is used in this example.");
Console.ForegroundColor = ConsoleColor.White;

var inferenceParams = new InferenceParams() { Temperature = 0.6f, MaxTokens = 60, GenerationControl = new NumberGenerationControl() };

while (true)
{
Console.Write("\nQuestion: ");
Console.ForegroundColor = ConsoleColor.Green;
var prompt = Console.ReadLine();
Console.ForegroundColor = ConsoleColor.White;
Console.Write("Answer: ");
prompt = $"Question: {prompt?.Trim()} Answer: ";
await foreach (var text in ex.InferAsync(prompt, inferenceParams).Spinner())
{
Console.Write(text);
}
}
}
}
}
1 change: 1 addition & 0 deletions LLama.Examples/Examples/Runner.cs
@@ -13,6 +13,7 @@ public class Runner
{ "Interactive mode chat by using executor.", InteractiveModeExecute.Run },
{ "Instruct mode chat by using executor.", InstructModeExecute.Run },
{ "Stateless mode chat by using executor.", StatelessModeExecute.Run },
{ "Customize the generation control of executor.", CustomGenerationControl.Run },
{ "Load and save chat session.", SaveAndLoadSession.Run },
{ "Load and save state of model and executor.", LoadAndSaveState.Run },
{ "Get embeddings from LLama model.", () => Task.Run(GetEmbeddings.Run) },
1 change: 1 addition & 0 deletions LLama.Unittest/TokenTests.cs
@@ -1,5 +1,6 @@
using System.Text;
using LLama.Common;
using LLama.Control;
using LLama.Extensions;

namespace LLama.Unittest;
4 changes: 4 additions & 0 deletions LLama.Web/Common/InferenceOptions.cs
@@ -4,6 +4,7 @@
using LLama.Abstractions;
using LLama.Native;
using LLama.Sampling;
using LLama.Control;

namespace LLama.Web.Common
{
@@ -71,5 +72,8 @@ public class InferenceOptions

/// <inheritdoc />
public ISamplingPipeline? SamplingPipeline { get; set; }

/// <inheritdoc />
public IGenerationControl? GenerationControl { get; set; }
}
}
8 changes: 7 additions & 1 deletion LLama/Abstractions/IInferenceParams.cs
@@ -1,5 +1,6 @@
using System.Collections.Generic;
using LLama.Common;
using LLama.Control;
using LLama.Native;
using LLama.Sampling;

@@ -114,5 +115,10 @@ public interface IInferenceParams
/// Set a custom sampling pipeline to use. <b>If this is set, all other sampling parameters are ignored!</b>
/// </summary>
ISamplingPipeline? SamplingPipeline { get; set; }
}

/// <summary>
/// Set a custom generation control to use. <b>If this is set, antiprompts will be ignored!</b>
/// </summary>
IGenerationControl? GenerationControl { get; set; }
}
}
4 changes: 4 additions & 0 deletions LLama/Common/InferenceParams.cs
@@ -3,6 +3,7 @@
using System.Collections.Generic;
using LLama.Native;
using LLama.Sampling;
using LLama.Control;

namespace LLama.Common
{
@@ -80,6 +81,9 @@ public record InferenceParams

/// <inheritdoc />
public ISamplingPipeline? SamplingPipeline { get; set; }

/// <inheritdoc />
public IGenerationControl? GenerationControl { get; set; }
}

/// <summary>
@@ -1,7 +1,7 @@
using System;
using System.Collections.Generic;

namespace LLama
namespace LLama.Control
{
/// <summary>
/// AntipromptProcessor keeps track of past tokens looking for any set Anti-Prompts
42 changes: 42 additions & 0 deletions LLama/Control/DefaultGenerationControl.cs
@@ -0,0 +1,42 @@
using LLama.Abstractions;
using System;
using System.Collections.Generic;
using System.Text;

namespace LLama.Control
{
/// <summary>
/// The default generation control in LLamaSharp, using antiprompts. This class should not be inherited.
/// <b>Note that this class is stateful: the previous outputs fed to it affect its decisions.</b>
/// If you use it in a session, please don't reuse it for another session unless you intend to do so.
/// </summary>
public sealed class DefaultGenerationControl : IGenerationControl
{
private AntipromptProcessor _antipromptProcessor;

/// <summary>
/// <inheritdoc/>
/// </summary>
public DefaultGenerationControl()
{
_antipromptProcessor = new AntipromptProcessor();
}

/// <summary>
/// <inheritdoc/>
/// </summary>
public bool ShouldStopGeneration(LLamaContext context, IInferenceParams inferenceParams, string lastOutputText)
{
_antipromptProcessor.SetAntiprompts(inferenceParams.AntiPrompts);
return _antipromptProcessor.Add(lastOutputText);
}

/// <summary>
/// <inheritdoc/>
/// </summary>
public bool ShouldStopGeneration(LLamaContext context, IInferenceParams inferenceParams, int lastOutputId)
{
return context.IsEOS(lastOutputId);
}
}
}
35 changes: 35 additions & 0 deletions LLama/Control/IGenerationControl.cs
@@ -0,0 +1,35 @@
using LLama.Abstractions;
using System;
using System.Collections.Generic;
using System.Text;

namespace LLama.Control
{
/// <summary>
/// Control the text generation of LLama Executors.
/// </summary>
public interface IGenerationControl
{
/// <summary>
/// Use the last output text to determine if the generation should stop.
/// This method is called after the overload that takes an output id.
/// Even if this returns true, the text is still returned; generation stops afterwards.
/// </summary>
/// <param name="context">The LLamaContext used in the current generation.</param>
/// <param name="inferenceParams">The inference params used in the current generation.</param>
/// <param name="lastOutputText">The last output text generated by the model.</param>
/// <returns></returns>
bool ShouldStopGeneration(LLamaContext context, IInferenceParams inferenceParams, string lastOutputText);
Member

Just a thought: is it possible to decouple this interface from IInferenceParams somehow? It feels a bit odd that GenerationControl overrides parts of IInferenceParams and is also configured by it.

The obvious way to do that would be to remove the inferenceParams parameter here, though that would introduce problems in DefaultGenerationControl.

Collaborator (author)

Yeah, I also thought of that, and I could come up with the following alternatives:

  1. Change the API of ILLamaExecutor to InferAsync(string text, IInferenceParams? inferenceParams = null, IGenerationControl? control = null, CancellationToken token = default); or a similar shape. It will introduce some breaking changes, though.
  2. Use a generic type: change ILLamaExecutor to ILLamaExecutor<TControl>, or change the InferAsync method.
  3. Use chained calls. For example, from the user's point of view, they could write var executor = new StatelessExecutor().WithControl().

Each of them has advantages and disadvantages. The most important question, I think, is whether we should make it class-level or method-level. I can't think of a case where a user must use different control strategies in different calls to the same executor, though that flexibility does exist. As a compromise, I'd like to suggest the following proposal:

  1. Use option 3 above, the chained call for ILLamaExecutor, with WithGenerationControl as an extension method (a rough sketch follows below).
  2. Define a static method to execute the generation, just like what I did in [WIP] refactor: init some experimental refactoring. #362. In this static method we could introduce any breaking change, like option 1 above. Since .NET 7 we can define static methods in an interface, but older versions don't support that, so with this approach some parts of the design may look a little awkward.

BTW, there's one related thing: the sampling pipeline. Currently InferenceParams contains many sampling parameters, while specifying SamplingPipeline overrides them entirely. That's fine on the master branch because we don't want to introduce substantial breaking changes now, but I'm wondering if we should refactor it before v1.0.0.
For example, we could add many kinds of sampling pipelines as we discussed here, along with a class named CustomSamplingPipeline, which accepts all the sampling-related parameters currently in InferenceParams.
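For illustration, a minimal sketch of what the chained-call shape could look like. The IControllableExecutor interface and the WithGenerationControl method are hypothetical names used only for this discussion; neither exists in this PR.

using LLama.Control;

namespace LLama.Extensions
{
    // Hypothetical interface for this sketch only: an executor whose generation control
    // can be swapped after construction. Not part of this PR.
    public interface IControllableExecutor
    {
        IGenerationControl GenerationControl { get; set; }
    }

    /// <summary>
    /// Sketch of the chained-call configuration (option 3 / proposal 1 above).
    /// </summary>
    public static class ExecutorControlExtensions
    {
        // Sets the control and returns the executor so calls can be chained, e.g.
        // var executor = new StatelessExecutor(model, parameters).WithGenerationControl(new DefaultGenerationControl());
        // (assuming StatelessExecutor implemented IControllableExecutor).
        public static TExecutor WithGenerationControl<TExecutor>(this TExecutor executor, IGenerationControl control)
            where TExecutor : IControllableExecutor
        {
            executor.GenerationControl = control;
            return executor;
        }
    }
}

This keeps the control at class level, matching the compromise proposed above.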

Member

I was thinking more along the lines of removing the IInferenceParams inferenceParams from the ShouldStopGeneration methods.
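For concreteness, a decoupled version might look roughly like the sketch below. It is not part of this PR; DefaultGenerationControl would then need to receive its antiprompts another way, for example through its constructor.

namespace LLama.Control
{
    // Sketch of the decoupled interface suggested above: the stop decision no longer
    // depends on IInferenceParams at all.
    public interface IGenerationControl
    {
        bool ShouldStopGeneration(LLamaContext context, string lastOutputText);

        bool ShouldStopGeneration(LLamaContext context, int lastOutputId);
    }
}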

Member

> That's fine on the master branch because we don't want to introduce substantial breaking changes now, but I'm wondering if we should refactor it before v1.0.0.

Yeah this is my thinking. It feels like we're moving in a direction where we can completely get rid of the inference params in the future (which is part of why I want to avoid depending on it in this new interface, if possible).


/// <summary>
/// Use the last output token to determine if the generation should stop.
/// This method is called before the overload that takes output text.
/// Even if this returns true, the token is still returned; generation stops afterwards.
/// </summary>
/// <param name="context">The LLamaContext used in the current generation.</param>
/// <param name="inferenceParams">The inference params used in the current generation.</param>
/// <param name="lastOutputId">The last output token generated by the model.</param>
/// <returns></returns>
bool ShouldStopGeneration(LLamaContext context, IInferenceParams inferenceParams, int lastOutputId);
}
}
27 changes: 27 additions & 0 deletions LLama/Extensions/GenerationControlExtensions.cs
@@ -0,0 +1,27 @@
using LLama.Abstractions;
using LLama.Control;
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;

namespace LLama.Extensions
{
/// <summary>
/// Extension methods for generation control
/// </summary>
public static class GenerationControlExtensions
{
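/// <summary>
/// Check a sequence of output token ids against the control, stopping at the first id for which the control requests a stop.
/// </summary>
/// <param name="control">The generation control to query.</param>
/// <param name="context">The LLamaContext used in the current generation.</param>
/// <param name="inferenceParams">The inference params used in the current generation.</param>
/// <param name="lastOutputIds">The last output token ids generated by the model.</param>
/// <returns>True if generation should stop.</returns>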
public static bool ShouldStopGeneration(this IGenerationControl control, LLamaContext context, IInferenceParams inferenceParams, IEnumerable<int> lastOutputIds)
{
foreach (var id in lastOutputIds)
{
if (control.ShouldStopGeneration(context, inferenceParams, id))
{
return true;
}
}
return false;
}
}
}
9 changes: 9 additions & 0 deletions LLama/LLamaContext.cs
@@ -87,6 +87,15 @@ public LLamaContext(LLamaWeights model, IContextParams @params, ILogger? logger
}

/// <summary>
/// Return whether a token marks the end of the sequence (EOS).
/// </summary>
/// <param name="token">The token to check.</param>
/// <returns>True if the token is the EOS token.</returns>
public bool IsEOS(int token)
{
return NativeApi.llama_token_eos(this.NativeHandle.ModelHandle) == token;
}

/// <summary>
/// Set the seed for the RNG
/// </summary>
/// <param name="seed"></param>
9 changes: 8 additions & 1 deletion LLama/LLamaExecutorBase.cs
@@ -60,6 +60,11 @@ public abstract class StatefulExecutorBase : ILLamaExecutor
/// The last tokens generated by the model.
/// </summary>
protected FixedSizeQueue<llama_token> _last_n_tokens;

/// <summary>
/// The last output text generated by the model.
/// </summary>
protected string _lastOutputText = string.Empty;

/// <summary>
/// The context used by the executor.
/// </summary>
@@ -299,9 +304,11 @@ public virtual async IAsyncEnumerable<string> InferAsync(string text, IInference
if (args.ReturnValue)
{
_decoder.AddRange(_embeds);
yield return _decoder.Read();
_lastOutputText = _decoder.Read();
yield return _lastOutputText;
}

// TODO(Rinne): Refactor the logic here.
var (breakGeneration, extraOutputs) = await PostProcess(inferenceParams, args);
if (extraOutputs is { Count: > 0 })
{
14 changes: 13 additions & 1 deletion LLama/LLamaInteractExecutor.cs
@@ -10,6 +10,7 @@
using System.Threading.Tasks;
using LLama.Extensions;
using Microsoft.Extensions.Logging;
using LLama.Control;

namespace LLama
{
@@ -21,6 +22,7 @@ public class InteractiveExecutor : StatefulExecutorBase
{
private bool _is_prompt_run = true;
private readonly llama_token _llama_token_newline;
private IGenerationControl _control;

/// <summary>
///
@@ -31,6 +33,7 @@ public InteractiveExecutor(LLamaContext context, ILogger? logger = null)
: base(context, logger)
{
_llama_token_newline = NativeApi.llama_token_nl(Context.NativeHandle.ModelHandle);
_control = new DefaultGenerationControl();
}

/// <inheritdoc />
@@ -134,8 +137,17 @@ protected override Task PreprocessInputs(string text, InferStateArgs args)
{
if (_embed_inps.Count <= _consumedTokensCount)
{
if (_last_n_tokens.TokensEndsWithAnyString(args.Antiprompts, Context.NativeHandle.ModelHandle, Context.Encoding))
var control = inferenceParams.GenerationControl ?? _control;
// Get stop signal by ids
if (control.ShouldStopGeneration(Context, inferenceParams, _embeds))
{
args.WaitForInput = true;
}
// Get stop signal by text
else if (control.ShouldStopGeneration(Context, inferenceParams, _lastOutputText))
{
args.WaitForInput = true;
}

if (_pastTokensCount > 0 && args.WaitForInput)
return (true, Array.Empty<string>());
16 changes: 11 additions & 5 deletions LLama/LLamaStatelessExecutor.cs
@@ -8,6 +8,7 @@
using System.Threading.Tasks;
using LLama.Native;
using LLama.Sampling;
using LLama.Control;
using Microsoft.Extensions.Logging;

namespace LLama
@@ -66,8 +67,8 @@ public async IAsyncEnumerable<string> InferAsync(string prompt, IInferenceParams
throw new ArgumentOutOfRangeException(nameof(inferenceParams), $"TokensKeep ({inferenceParams.TokensKeep}) cannot be larger than ContextSize ({Context.ContextSize})");

// Create decoders for the token stream
IGenerationControl control = inferenceParams.GenerationControl ?? new DefaultGenerationControl();
var decoder = new StreamingTokenDecoder(Context);
var antiprocessor = new AntipromptProcessor(inferenceParams.AntiPrompts);

// Keep track of the last N tokens emitted
var repeat_last_n = Math.Max(0, inferenceParams.RepeatLastTokensCount < 0 ? _weights.ContextSize : inferenceParams.RepeatLastTokensCount);
@@ -113,12 +114,17 @@ public async IAsyncEnumerable<string> InferAsync(string prompt, IInferenceParams
var decoded = decoder.Read();
yield return decoded;

// Check if any of the antiprompts have been generated
if (antiprocessor.Add(decoded))
break;

lastTokens.Add(id);
tokens.Clear();

// Check if we should stop generation based on the token id
if (control.ShouldStopGeneration(Context, inferenceParams, id))
break;
// Check if we should stop generation based on the decoded text
if (control.ShouldStopGeneration(Context, inferenceParams, decoded))
break;

// prepare for the next loop
tokens.Add(id);

// when we run out of context