From 045d6d7110ff6ae753603d379b586e6c128a059a Mon Sep 17 00:00:00 2001 From: Martin Evans Date: Fri, 10 May 2024 19:19:24 +0100 Subject: [PATCH] Rewritten some examples docs, explaining what these examples show instead of just showing the source code. --- docs/Examples/BatchedExecutorFork.md | 196 ++++++----------------- docs/Examples/BatchedExecutorGuidance.md | 131 +-------------- docs/Examples/BatchedExecutorRewind.md | 122 +------------- 3 files changed, 55 insertions(+), 394 deletions(-) diff --git a/docs/Examples/BatchedExecutorFork.md b/docs/Examples/BatchedExecutorFork.md index ad391dd1c..8ec4887b3 100644 --- a/docs/Examples/BatchedExecutorFork.md +++ b/docs/Examples/BatchedExecutorFork.md @@ -1,148 +1,48 @@ -# Bacthed executor - multi-output to one input - -```cs -using LLama.Batched; -using LLama.Common; -using LLama.Native; -using LLama.Sampling; -using Spectre.Console; - -namespace LLama.Examples.Examples; - -/// -/// This demonstrates generating multiple replies to the same prompt, with a shared cache -/// -public class BatchedExecutorFork -{ - private const int n_split = 16; - private const int n_len = 72; - - public static async Task Run() - { - string modelPath = UserSettings.GetModelPath(); - - var parameters = new ModelParams(modelPath); - using var model = LLamaWeights.LoadFromFile(parameters); - - var prompt = AnsiConsole.Ask("Prompt (or ENTER for default):", "Not many people know that"); - - // Create an executor that can evaluate a batch of conversations together - using var executor = new BatchedExecutor(model, parameters); - - // Print some info - var name = executor.Model.Metadata.GetValueOrDefault("general.name", "unknown model name"); - Console.WriteLine($"Created executor with model: {name}"); - - // Evaluate the initial prompt to create one conversation - using var start = executor.Create(); - start.Prompt(prompt); - await executor.Infer(); - - // Create the root node of the tree - var root = new Node(start); - - await AnsiConsole - .Progress() - .StartAsync(async progress => - { - var reporter = progress.AddTask("Running Inference (1)", maxValue: n_len); - - // Run inference loop - for (var i = 0; i < n_len; i++) - { - if (i != 0) - await executor.Infer(); - - // Occasionally fork all the active conversations - if (i != 0 && i % n_split == 0) - root.Split(); - - // Sample all active conversations - root.Sample(); - - // Update progress bar - reporter.Increment(1); - reporter.Description($"Running Inference ({root.ActiveConversationCount})"); - } - - // Display results - var display = new Tree(prompt); - root.Display(display); - AnsiConsole.Write(display); - }); - } - - private class Node - { - private readonly StreamingTokenDecoder _decoder; - - private readonly DefaultSamplingPipeline _sampler; - private Conversation? _conversation; - - private Node? _left; - private Node? _right; - - public int ActiveConversationCount => _conversation != null ? 
1 : _left!.ActiveConversationCount + _right!.ActiveConversationCount; - - public Node(Conversation conversation) - { - _sampler = new DefaultSamplingPipeline(); - _conversation = conversation; - _decoder = new StreamingTokenDecoder(conversation.Executor.Context); - } - - public void Sample() - { - if (_conversation == null) - { - _left?.Sample(); - _right?.Sample(); - return; - } - - if (_conversation.RequiresInference) - return; - - // Sample one token - var ctx = _conversation.Executor.Context.NativeHandle; - var token = _sampler.Sample(ctx, _conversation.Sample(), Array.Empty()); - _sampler.Accept(ctx, token); - _decoder.Add(token); - - // Prompt the conversation with this token, to continue generating from there - _conversation.Prompt(token); - } - - public void Split() - { - if (_conversation != null) - { - _left = new Node(_conversation.Fork()); - _right = new Node(_conversation.Fork()); - - _conversation.Dispose(); - _conversation = null; - } - else - { - _left?.Split(); - _right?.Split(); - } - } - - public void Display(T tree, int depth = 0) - where T : IHasTreeNodes - { - var colors = new[] { "red", "green", "blue", "yellow", "white" }; - var color = colors[depth % colors.Length]; - - var message = Markup.Escape(_decoder.Read().ReplaceLineEndings("")); - - var n = tree.AddNode($"[{color}]{message}[/]"); - - _left?.Display(n, depth + 1); - _right?.Display(n, depth + 1); - } - } -} -``` \ No newline at end of file +# BatchedExecutor Fork - Generate Multiple Completions With Shared Memory + +This example demonstrates using the `BatchedExecutor` to split one sequence into multiple sequences. See the source code [here](https://github.com/SciSharp/LLamaSharp/blob/master/LLama.Examples/Examples/BatchedExecutorFork.cs). + +Sequences share memory up to the point they were split, meaning no extra memory is consumed by creating a fork. Inference runs for all sequences simultaneously, this means that running two sequences does _not_ take twice as much time as running one. + +An example output, starting with the prompt `Not many people know that`: + +``` +Not many people know that +└── , in the 17th century, a military band led by Captain Charles + ├── Bossler of Baden, Germany, composed and played a music suite titled + │ ├── the "Civil Psalm," in order to rally German Protestants during + │ │ ├── the Thirty Years' War. This tune became popular among German soldiers, + │ │ │ ├── and its popularity continued long after the war + │ │ │ └── and, eventually, reached France. The + │ │ └── the Thirty Years' War.This music, with its clear call + │ │ ├── to arms and strong Christian themes, helped + │ │ └── to arms and unwavering belief + │ └── "Baden's First National Symphony," with lyrics by a young Wol + │ ├── fgang Amadeus Mozart. The story of the composition's creation + │ │ ├── has long been forgotten. But the B + │ │ └── was popularized by a novelty book + │ └── fgang Amadeus Mozart. It's said that this music brought + │ ├── peace to Europe, at least for a + │ └── the troops together during difficult times. It + └── Newdick played a mournful dirge to accompany the procession of + ├── the head of King Charles I. It is the scene that opens my latest book + │ ├── , "Death and Taxes." The book follows a British army captain named + │ │ ├── Marcus as he seeks revenge for his wife + │ │ └── William Darnay who becomes involved in + │ └── , A King, A Pawn and a Prince. 
The murder of the king
+        │       ├── and the civil war that followed are the
+        │       └── is a watershed moment in the political
+        └── the coffin of William Shakespeare, as it was carried to its final resting place
+            ├── . That is the least that can be said for a man who is often regarded
+            │   ├── as the greatest writer in the English language
+            │   └── as the greatest writer the English language has
+            └── at Stratford-upon-Avon. Shakespeare, of course
+                ├── , was a famous English poet and play
+                └── , was one of the greatest playwright
+```
+
+Forked sequences can be used for many purposes, for example:
+ - Evaluating the system prompt once and forking for each independent conversation.
+ - Saving a "checkpoint" in a conversation to return to later.
+ - Beam Search.
+ - Splitting a conversation, generating completions from several different "agents", and taking the best response.
\ No newline at end of file
diff --git a/docs/Examples/BatchedExecutorGuidance.md b/docs/Examples/BatchedExecutorGuidance.md
index 94d0ef867..99912ae40 100644
--- a/docs/Examples/BatchedExecutorGuidance.md
+++ b/docs/Examples/BatchedExecutorGuidance.md
@@ -1,130 +1,7 @@
-# Batched executor - basic guidance
+# BatchedExecutor Guidance - Classifier Free Guidance / Negative Prompting
 
-```cs
-using LLama.Batched;
-using LLama.Common;
-using LLama.Native;
-using LLama.Sampling;
-using Spectre.Console;
+This example demonstrates using `Classifier Free Guidance` (a.k.a. negative prompting) with a custom sampling pipeline. Negative prompting is a way of steering the model output away from certain topics. See the source code [here](https://github.com/SciSharp/LLamaSharp/blob/master/LLama.Examples/Examples/BatchedExecutorGuidance.cs).
 
-namespace LLama.Examples.Examples;
+Two conversations are created. The `guided` conversation starts with the prompt that should be completed, whose completion is shown as the output, for example `"my favourite colour is"`. The `guidance` conversation starts with the negative prompt, for example `"I hate the colour red. My favourite colour is"`. Note that this is a _negative_ prompt, so the guidance will steer the model towards answering as if it _likes_ the colour red.
 
-/// 
-/// This demonstrates using a batch to generate two sequences and then using one
-/// sequence as the negative guidance ("classifier free guidance") for the other.
-/// 
-public class BatchedExecutorGuidance
-{
-    private const int n_len = 32;
-
-    public static async Task Run()
-    {
-        string modelPath = UserSettings.GetModelPath();
-
-        var parameters = new ModelParams(modelPath);
-        using var model = LLamaWeights.LoadFromFile(parameters);
-
-        var positivePrompt = AnsiConsole.Ask("Positive Prompt (or ENTER for default):", "My favourite colour is").Trim();
-        var negativePrompt = AnsiConsole.Ask("Negative Prompt (or ENTER for default):", "I hate the colour red. 
My favourite colour is").Trim(); - var weight = AnsiConsole.Ask("Guidance Weight (or ENTER for default):", 2.0f); - - // Create an executor that can evaluate a batch of conversations together - using var executor = new BatchedExecutor(model, parameters); - - // Print some info - var name = executor.Model.Metadata.GetValueOrDefault("general.name", "unknown model name"); - Console.WriteLine($"Created executor with model: {name}"); - - // Load the two prompts into two conversations - using var guided = executor.Create(); - guided.Prompt(positivePrompt); - using var guidance = executor.Create(); - guidance.Prompt(negativePrompt); - - // Run inference to evaluate prompts - await AnsiConsole - .Status() - .Spinner(Spinner.Known.Line) - .StartAsync("Evaluating Prompts...", _ => executor.Infer()); - - // Fork the "guided" conversation. We'll run this one without guidance for comparison - using var unguided = guided.Fork(); - - // Run inference loop - var unguidedSampler = new GuidedSampler(null, weight); - var unguidedDecoder = new StreamingTokenDecoder(executor.Context); - var guidedSampler = new GuidedSampler(guidance, weight); - var guidedDecoder = new StreamingTokenDecoder(executor.Context); - await AnsiConsole - .Progress() - .StartAsync(async progress => - { - var reporter = progress.AddTask("Running Inference", maxValue: n_len); - - for (var i = 0; i < n_len; i++) - { - if (i != 0) - await executor.Infer(); - - // Sample from the "unguided" conversation. This is just a conversation using the same prompt, without any - // guidance. This serves as a comparison to show the effect of guidance. - var u = unguidedSampler.Sample(executor.Context.NativeHandle, unguided.Sample(), Array.Empty()); - unguidedDecoder.Add(u); - unguided.Prompt(u); - - // Sample from the "guided" conversation. This sampler will internally use the "guidance" conversation - // to steer the conversation. See how this is done in GuidedSampler.ProcessLogits (bottom of this file). - var g = guidedSampler.Sample(executor.Context.NativeHandle, guided.Sample(), Array.Empty()); - guidedDecoder.Add(g); - - // Use this token to advance both guided _and_ guidance. Keeping them in sync (except for the initial prompt). - guided.Prompt(g); - guidance.Prompt(g); - - // Early exit if we reach the natural end of the guided sentence - if (g == model.EndOfSentenceToken) - break; - - // Update progress bar - reporter.Increment(1); - } - }); - - AnsiConsole.MarkupLine($"[green]Unguided:[/][white]{unguidedDecoder.Read().ReplaceLineEndings(" ")}[/]"); - AnsiConsole.MarkupLine($"[green]Guided:[/][white]{guidedDecoder.Read().ReplaceLineEndings(" ")}[/]"); - } - - private class GuidedSampler(Conversation? 
guidance, float weight)
-        : BaseSamplingPipeline
-    {
-        public override void Accept(SafeLLamaContextHandle ctx, LLamaToken token)
-        {
-        }
-
-        public override ISamplingPipeline Clone()
-        {
-            throw new NotSupportedException();
-        }
-
-        protected override void ProcessLogits(SafeLLamaContextHandle ctx, Span logits, ReadOnlySpan lastTokens)
-        {
-            if (guidance == null)
-                return;
-
-            // Get the logits generated by the guidance sequences
-            var guidanceLogits = guidance.Sample();
-
-            // Use those logits to guide this sequence
-            NativeApi.llama_sample_apply_guidance(ctx, logits, guidanceLogits, weight);
-        }
-
-        protected override LLamaToken ProcessTokenDataArray(SafeLLamaContextHandle ctx, LLamaTokenDataArray candidates, ReadOnlySpan lastTokens)
-        {
-            candidates.Temperature(ctx, 0.8f);
-            candidates.TopK(ctx, 25);
-
-            return candidates.SampleToken(ctx);
-        }
-    }
-}
-```
\ No newline at end of file
+A custom sampler samples the `guidance` conversation and uses that output to influence the output of the `guided` conversation. Once a token is selected, _both_ conversations are continued with this token, keeping the two sequences in sync.
\ No newline at end of file
diff --git a/docs/Examples/BatchedExecutorRewind.md b/docs/Examples/BatchedExecutorRewind.md
index 06287b7c9..78c480c77 100644
--- a/docs/Examples/BatchedExecutorRewind.md
+++ b/docs/Examples/BatchedExecutorRewind.md
@@ -1,121 +1,5 @@
-# Batched executor - rewinding to an earlier state
+# BatchedExecutor - Rewind
 
-```cs
-using LLama.Batched;
-using LLama.Common;
-using LLama.Native;
-using LLama.Sampling;
-using Spectre.Console;
+This example demonstrates using the `BatchedExecutor` to generate tokens and then rewind the conversation back to an earlier state. See the source code [here](https://github.com/SciSharp/LLamaSharp/blob/master/LLama.Examples/Examples/BatchedExecutorRewind.cs).
 
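+For orientation, the core `BatchedExecutor` flow that this example (and the other batched examples) builds on looks roughly like the sketch below. This is a condensed, illustrative outline based on the example source rather than a drop-in snippet: `modelPath` and the prompt are placeholders, the loop is assumed to run inside an async method, and exact method signatures may vary between LLamaSharp versions.
+
+```cs
+using LLama;
+using LLama.Batched;
+using LLama.Common;
+using LLama.Sampling;
+
+// Load the model and create an executor that can evaluate a batch of conversations together
+var parameters = new ModelParams(modelPath);
+using var model = LLamaWeights.LoadFromFile(parameters);
+using var executor = new BatchedExecutor(model, parameters);
+
+// Start a conversation by prompting it
+using var conversation = executor.Create();
+conversation.Prompt("Not many people know that");
+
+var sampler = new DefaultSamplingPipeline();
+var decoder = new StreamingTokenDecoder(executor.Context);
+
+for (var i = 0; i < 24; i++)
+{
+    // Evaluate all pending conversations in one batch
+    await executor.Infer();
+
+    // Sample the next token for this conversation and feed it back in to continue generating
+    var token = sampler.Sample(executor.Context.NativeHandle, conversation.Sample(), Array.Empty<LLamaToken>());
+    decoder.Add(token);
+    conversation.Prompt(token);
+}
+
+Console.WriteLine(decoder.Read());
+```
+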
-namespace LLama.Examples.Examples;
-
-/// 
-/// This demonstrates generating tokens and then rewinding to an earlier state
-/// 
-public class BatchedExecutorRewind
-{
-    private const int n_generate = 24;
-    private const int n_rewind = 12;
-    private const int n_repeats = 6;
-
-    public static async Task Run()
-    {
-        string modelPath = UserSettings.GetModelPath();
-
-        var parameters = new ModelParams(modelPath);
-        using var model = LLamaWeights.LoadFromFile(parameters);
-
-        var prompt = AnsiConsole.Ask("Prompt (or ENTER for default):", "Not many people know that");
-
-        // Create an executor that can evaluate a batch of conversations together
-        using var executor = new BatchedExecutor(model, parameters);
-
-        // Print some info
-        var name = executor.Model.Metadata.GetValueOrDefault("general.name", "unknown model name");
-        Console.WriteLine($"Created executor with model: {name}");
-
-        // Evaluate the initial prompt to create one conversation
-        using var conversation = executor.Create();
-        conversation.Prompt(prompt);
-
-        // Create the start node wrapping the conversation
-        var node = new Node(executor.Context);
-
-        // Print the prompt
-        Console.ForegroundColor = ConsoleColor.Green;
-        Console.WriteLine(prompt);
-
-        for (var i = 0; i < n_repeats; i++)
-        {
-            for (var j = 0; j < n_generate; j++)
-            {
-                // Run inference
-                await executor.Infer();
-
-                // Sample a token
-                var token = node.Sample(conversation);
-
-                // Continue conversation with this token
-                if (j != n_generate - 1)
-                    conversation.Prompt(token);
-            }
-
-            // Write out what we generated
-            node.Write(n_rewind, i + 1);
-
-            // Rewind back a few tokens
-            conversation.Rewind(n_rewind + 1);
-
-            // Prompt with a token
-            conversation.Prompt(node.GetToken(n_generate - n_rewind - 1));
-
-            // Create a new node around the rewound conversation
-            node = new Node(executor.Context);
-        }
-
-        Console.WriteLine("Press any key to exit demo");
-        Console.ReadKey(true);
-    }
-
-    private class Node
-    {
-        private readonly LLamaContext _context;
-
-        private readonly List _tokens = new List();
-        private readonly DefaultSamplingPipeline Sampler;
-
-        public Node(LLamaContext context)
-        {
-            _context = context;
-            Sampler = new DefaultSamplingPipeline();
-        }
-
-        public LLamaToken Sample(Conversation conversation)
-        {
-            var token = Sampler.Sample(_context.NativeHandle, conversation.Sample(), Array.Empty());
-            _tokens.Add(token);
-            return token;
-        }
-
-        public void Write(int n_rewind, int depth)
-        {
-            var decoder = new StreamingTokenDecoder(_context);
-
-            for (var i = 0; i < _tokens.Count - n_rewind; i++)
-                decoder.Add(_tokens[i]);
-
-            AnsiConsole.MarkupLine($"[green]{new string(' ', depth * 3) + decoder.Read().ReplaceLineEndings(" ")}[/]");
-
-            for (var i = _tokens.Count - n_rewind; i < _tokens.Count; i++)
-                decoder.Add(_tokens[i]);
-
-            AnsiConsole.MarkupLine($"[maroon]{decoder.Read().ReplaceLineEndings(" ")}[/]");
-        }
-
-        public LLamaToken GetToken(int index)
-        {
-            return _tokens[index];
-        }
-    }
-}
-```
\ No newline at end of file
+A single conversation is prompted and then continued for 24 tokens. After that it is rewound by 12 tokens and continued again from that earlier point, and this generate-and-rewind cycle is repeated several times. Rewinding simply sets the conversation back to an earlier state and requires no extra computation.
\ No newline at end of file
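+
+Continuing from the loop sketched above, the rewind step itself looks roughly like the sketch below. This is again an illustrative outline based on the example source: `nGenerate`, `nRewind` and `tokens` are placeholder names, and `executor`, `conversation` and `sampler` are the objects created in the earlier sketch.
+
+```cs
+const int nGenerate = 24;
+const int nRewind = 12;
+var tokens = new List<LLamaToken>();
+
+// Generate a block of tokens, remembering each one so we can return to an earlier point later
+for (var i = 0; i < nGenerate; i++)
+{
+    await executor.Infer();
+
+    var token = sampler.Sample(executor.Context.NativeHandle, conversation.Sample(), Array.Empty<LLamaToken>());
+    tokens.Add(token);
+
+    // Continue the conversation with this token (as in the example, the final token is not added)
+    if (i != nGenerate - 1)
+        conversation.Prompt(token);
+}
+
+// Rewind the conversation back a few tokens - this only moves its state back, nothing is re-evaluated
+conversation.Rewind(nRewind + 1);
+
+// Prompt with the token at the rewind point, so generation picks up (and may diverge) from there
+conversation.Prompt(tokens[nGenerate - nRewind - 1]);
+```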