196 changes: 48 additions & 148 deletions docs/Examples/BatchedExecutorFork.md
@@ -1,148 +1,48 @@
# Batched executor - multi-output to one input

```cs
using LLama.Batched;
using LLama.Common;
using LLama.Native;
using LLama.Sampling;
using Spectre.Console;

namespace LLama.Examples.Examples;

/// <summary>
/// This demonstrates generating multiple replies to the same prompt, with a shared cache
/// </summary>
public class BatchedExecutorFork
{
private const int n_split = 16;
private const int n_len = 72;

public static async Task Run()
{
string modelPath = UserSettings.GetModelPath();

var parameters = new ModelParams(modelPath);
using var model = LLamaWeights.LoadFromFile(parameters);

var prompt = AnsiConsole.Ask("Prompt (or ENTER for default):", "Not many people know that");

// Create an executor that can evaluate a batch of conversations together
using var executor = new BatchedExecutor(model, parameters);

// Print some info
var name = executor.Model.Metadata.GetValueOrDefault("general.name", "unknown model name");
Console.WriteLine($"Created executor with model: {name}");

// Evaluate the initial prompt to create one conversation
using var start = executor.Create();
start.Prompt(prompt);
await executor.Infer();

// Create the root node of the tree
var root = new Node(start);

await AnsiConsole
.Progress()
.StartAsync(async progress =>
{
var reporter = progress.AddTask("Running Inference (1)", maxValue: n_len);

// Run inference loop
for (var i = 0; i < n_len; i++)
{
if (i != 0)
await executor.Infer();

// Occasionally fork all the active conversations
if (i != 0 && i % n_split == 0)
root.Split();

// Sample all active conversations
root.Sample();

// Update progress bar
reporter.Increment(1);
reporter.Description($"Running Inference ({root.ActiveConversationCount})");
}

// Display results
var display = new Tree(prompt);
root.Display(display);
AnsiConsole.Write(display);
});
}

private class Node
{
private readonly StreamingTokenDecoder _decoder;

private readonly DefaultSamplingPipeline _sampler;
private Conversation? _conversation;

private Node? _left;
private Node? _right;

public int ActiveConversationCount => _conversation != null ? 1 : _left!.ActiveConversationCount + _right!.ActiveConversationCount;

public Node(Conversation conversation)
{
_sampler = new DefaultSamplingPipeline();
_conversation = conversation;
_decoder = new StreamingTokenDecoder(conversation.Executor.Context);
}

public void Sample()
{
if (_conversation == null)
{
_left?.Sample();
_right?.Sample();
return;
}

if (_conversation.RequiresInference)
return;

// Sample one token
var ctx = _conversation.Executor.Context.NativeHandle;
var token = _sampler.Sample(ctx, _conversation.Sample(), Array.Empty<LLamaToken>());
_sampler.Accept(ctx, token);
_decoder.Add(token);

// Prompt the conversation with this token, to continue generating from there
_conversation.Prompt(token);
}

public void Split()
{
if (_conversation != null)
{
_left = new Node(_conversation.Fork());
_right = new Node(_conversation.Fork());

_conversation.Dispose();
_conversation = null;
}
else
{
_left?.Split();
_right?.Split();
}
}

public void Display<T>(T tree, int depth = 0)
where T : IHasTreeNodes
{
var colors = new[] { "red", "green", "blue", "yellow", "white" };
var color = colors[depth % colors.Length];

var message = Markup.Escape(_decoder.Read().ReplaceLineEndings(""));

var n = tree.AddNode($"[{color}]{message}[/]");

_left?.Display(n, depth + 1);
_right?.Display(n, depth + 1);
}
}
}
```
# BatchedExecutor Fork - Generate Multiple Completions With Shared Memory

This example demonstrates using the `BatchedExecutor` to split one sequence into multiple sequences. See the source code [here](https://github.com/SciSharp/LLamaSharp/blob/master/LLama.Examples/Examples/BatchedExecutorFork.cs).

Sequences share memory up to the point at which they were split, so no extra memory is consumed by creating a fork. Inference runs for all sequences simultaneously, which means that running two sequences does _not_ take twice as much time as running one.
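
The full example above wraps this pattern in a small tree of `Node` objects so it can keep splitting. As a quick orientation, the sketch below condenses the core flow (create → prompt → infer → fork → sample) into a single method, using the same calls as the example code at the time of this PR. `ForkSketch`, `modelPath` and `prompt` are placeholder names, and the exact signatures may differ in newer LLamaSharp releases.

```cs
using System;
using System.Threading.Tasks;
using LLama;
using LLama.Batched;
using LLama.Common;
using LLama.Native;
using LLama.Sampling;

// Condensed sketch of the fork flow shown in the full example above.
public static class ForkSketch
{
    public static async Task RunAsync(string modelPath, string prompt)
    {
        var parameters = new ModelParams(modelPath);
        using var model = LLamaWeights.LoadFromFile(parameters);
        using var executor = new BatchedExecutor(model, parameters);

        // Evaluate the shared prompt once, in a single conversation
        using var start = executor.Create();
        start.Prompt(prompt);
        await executor.Infer();

        // Fork it: both forks share the KV cache for the prompt,
        // so the shared prefix costs no extra memory
        using var left = start.Fork();
        using var right = start.Fork();

        // Sample each fork with its own pipeline (as the example does per Node)
        // and feed the chosen token back into that fork
        var ctx = executor.Context.NativeHandle;
        foreach (var conversation in new[] { left, right })
        {
            var sampler = new DefaultSamplingPipeline();
            var token = sampler.Sample(ctx, conversation.Sample(), Array.Empty<LLamaToken>());
            sampler.Accept(ctx, token);
            conversation.Prompt(token);
        }

        // A single batched Infer() call now evaluates both sequences together
        await executor.Infer();
    }
}
```

Each fork gets its own sampling pipeline so per-sequence sampling state does not leak between branches, mirroring the per-`Node` sampler in the full example.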

An example output, starting with the prompt `Not many people know that`:

```
Not many people know that
└── , in the 17th century, a military band led by Captain Charles
├── Bossler of Baden, Germany, composed and played a music suite titled
│ ├── the "Civil Psalm," in order to rally German Protestants during
│ │ ├── the Thirty Years' War. This tune became popular among German soldiers,
│ │ │ ├── and its popularity continued long after the war
│ │ │ └── and, eventually, reached France. The
│ │ └── the Thirty Years' War.This music, with its clear call
│ │ ├── to arms and strong Christian themes, helped
│ │ └── to arms and unwavering belief
│ └── "Baden's First National Symphony," with lyrics by a young Wol
│ ├── fgang Amadeus Mozart. The story of the composition's creation
│ │ ├── has long been forgotten. But the B
│ │ └── was popularized by a novelty book
│ └── fgang Amadeus Mozart. It's said that this music brought
│ ├── peace to Europe, at least for a
│ └── the troops together during difficult times. It
└── Newdick played a mournful dirge to accompany the procession of
├── the head of King Charles I. It is the scene that opens my latest book
│ ├── , "Death and Taxes." The book follows a British army captain named
│ │ ├── Marcus as he seeks revenge for his wife
│ │ └── William Darnay who becomes involved in
│ └── , A King, A Pawn and a Prince. The murder of the king
│ ├── and the civil war that followed are the
│ └── is a watershed moment in the political
└── the coffin of William Shakespeare, as it was carried to its final resting place
├── . That is the least that can be said for a man who is often regarded
│ ├── as the greatest writer in the English language
│ └── as the greatest writer the English language has
└── at Stratford-upon-Avon. Shakespeare, of course
├── , was a famous English poet and play
└── , was one of the greatest playwright
```

Forked sequences have many possible uses. For example:
- Evaluating the system prompt once and forking for each independent conversation (a sketch of this follows the list).
- Saving a "checkpoint" in a conversation to return to later.
- Beam Search.
- Splitting a conversation, generating completions from several different "agents", and taking the best response.
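
As a rough illustration of the first use case, here is a minimal sketch that evaluates a shared system prompt once and then hands out one fork per user. `SharedSystemPrompt`, `ForkPerUserAsync` and `userCount` are invented names for illustration; the calls themselves (`Create`, `Prompt`, `Infer`, `Fork`) are the ones used in the example above.

```cs
using System.Threading.Tasks;
using LLama.Batched;

// Hypothetical helper: pay for the system prompt evaluation once,
// then give every caller a fork that shares it.
public static class SharedSystemPrompt
{
    public static async Task<Conversation[]> ForkPerUserAsync(
        BatchedExecutor executor, string systemPrompt, int userCount)
    {
        // Evaluate the system prompt a single time
        using var template = executor.Create();
        template.Prompt(systemPrompt);
        await executor.Infer();

        // Each fork shares the cached system prompt; from here every
        // conversation can be prompted and generated independently
        var forks = new Conversation[userCount];
        for (var i = 0; i < userCount; i++)
            forks[i] = template.Fork();

        return forks;
    }
}
```
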
131 changes: 4 additions & 127 deletions docs/Examples/BatchedExecutorGuidance.md
@@ -1,130 +1,7 @@
# Batched executor - basic guidance
# BatchedExecutor Guidance - Classifier Free Guidance / Negative Prompting

```cs
using LLama.Batched;
using LLama.Common;
using LLama.Native;
using LLama.Sampling;
using Spectre.Console;
This example demonstrates using `Classifier Free Guidance` (a.k.a. negative prompting) with a custom sampling pipeline. Negative prompting is a way of steering the model output away from certain topics. See the source code [here](https://github.com/SciSharp/LLamaSharp/blob/master/LLama.Examples/Examples/BatchedExecutorGuidance.cs).

namespace LLama.Examples.Examples;
Two conversations are created. The `guided` conversation starts with the prompt that should be completed as shown as the output, for example `"my favourite colour is"`. The `guidance` conversation contains the negative prompt at the start, for example `"I hate the colour red. My favourite colour is"`. Note that this is a _negative_ prompt, so therefore this guidance will make the model answer as if it _likes_ the colour red.

/// <summary>
/// This demonstrates using a batch to generate two sequences and then using one
/// sequence as the negative guidance ("classifier free guidance") for the other.
/// </summary>
public class BatchedExecutorGuidance
{
private const int n_len = 32;

public static async Task Run()
{
string modelPath = UserSettings.GetModelPath();

var parameters = new ModelParams(modelPath);
using var model = LLamaWeights.LoadFromFile(parameters);

var positivePrompt = AnsiConsole.Ask("Positive Prompt (or ENTER for default):", "My favourite colour is").Trim();
var negativePrompt = AnsiConsole.Ask("Negative Prompt (or ENTER for default):", "I hate the colour red. My favourite colour is").Trim();
var weight = AnsiConsole.Ask("Guidance Weight (or ENTER for default):", 2.0f);

// Create an executor that can evaluate a batch of conversations together
using var executor = new BatchedExecutor(model, parameters);

// Print some info
var name = executor.Model.Metadata.GetValueOrDefault("general.name", "unknown model name");
Console.WriteLine($"Created executor with model: {name}");

// Load the two prompts into two conversations
using var guided = executor.Create();
guided.Prompt(positivePrompt);
using var guidance = executor.Create();
guidance.Prompt(negativePrompt);

// Run inference to evaluate prompts
await AnsiConsole
.Status()
.Spinner(Spinner.Known.Line)
.StartAsync("Evaluating Prompts...", _ => executor.Infer());

// Fork the "guided" conversation. We'll run this one without guidance for comparison
using var unguided = guided.Fork();

// Run inference loop
var unguidedSampler = new GuidedSampler(null, weight);
var unguidedDecoder = new StreamingTokenDecoder(executor.Context);
var guidedSampler = new GuidedSampler(guidance, weight);
var guidedDecoder = new StreamingTokenDecoder(executor.Context);
await AnsiConsole
.Progress()
.StartAsync(async progress =>
{
var reporter = progress.AddTask("Running Inference", maxValue: n_len);

for (var i = 0; i < n_len; i++)
{
if (i != 0)
await executor.Infer();

// Sample from the "unguided" conversation. This is just a conversation using the same prompt, without any
// guidance. This serves as a comparison to show the effect of guidance.
var u = unguidedSampler.Sample(executor.Context.NativeHandle, unguided.Sample(), Array.Empty<LLamaToken>());
unguidedDecoder.Add(u);
unguided.Prompt(u);

// Sample from the "guided" conversation. This sampler will internally use the "guidance" conversation
// to steer the conversation. See how this is done in GuidedSampler.ProcessLogits (bottom of this file).
var g = guidedSampler.Sample(executor.Context.NativeHandle, guided.Sample(), Array.Empty<LLamaToken>());
guidedDecoder.Add(g);

// Use this token to advance both guided _and_ guidance. Keeping them in sync (except for the initial prompt).
guided.Prompt(g);
guidance.Prompt(g);

// Early exit if we reach the natural end of the guided sentence
if (g == model.EndOfSentenceToken)
break;

// Update progress bar
reporter.Increment(1);
}
});

AnsiConsole.MarkupLine($"[green]Unguided:[/][white]{unguidedDecoder.Read().ReplaceLineEndings(" ")}[/]");
AnsiConsole.MarkupLine($"[green]Guided:[/][white]{guidedDecoder.Read().ReplaceLineEndings(" ")}[/]");
}

private class GuidedSampler(Conversation? guidance, float weight)
: BaseSamplingPipeline
{
public override void Accept(SafeLLamaContextHandle ctx, LLamaToken token)
{
}

public override ISamplingPipeline Clone()
{
throw new NotSupportedException();
}

protected override void ProcessLogits(SafeLLamaContextHandle ctx, Span<float> logits, ReadOnlySpan<LLamaToken> lastTokens)
{
if (guidance == null)
return;

// Get the logits generated by the guidance sequences
var guidanceLogits = guidance.Sample();

// Use those logits to guide this sequence
NativeApi.llama_sample_apply_guidance(ctx, logits, guidanceLogits, weight);
}

protected override LLamaToken ProcessTokenDataArray(SafeLLamaContextHandle ctx, LLamaTokenDataArray candidates, ReadOnlySpan<LLamaToken> lastTokens)
{
candidates.Temperature(ctx, 0.8f);
candidates.TopK(ctx, 25);

return candidates.SampleToken(ctx);
}
}
}
```
A custom sampler samples the `guidance` conversation and uses its logits to steer the output of the `guided` conversation. Once a token is selected, _both_ conversations are continued with that token, as sketched below.
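
The snippet below condenses that loop step into a hypothetical helper, assuming the `ISamplingPipeline.Sample`, `Conversation.Sample` and `Conversation.Prompt` signatures are as shown in the example code at the time of this PR.

```cs
using System;
using LLama.Batched;
using LLama.Native;
using LLama.Sampling;

// Hypothetical helper condensing one iteration of the example's inference loop.
public static class GuidanceStep
{
    public static LLamaToken SampleAndAdvance(
        SafeLLamaContextHandle ctx,
        ISamplingPipeline guidedSampler,
        Conversation guided,
        Conversation guidance)
    {
        // The GuidedSampler's ProcessLogits override (shown above) pulls logits
        // from the `guidance` conversation and blends them in via
        // NativeApi.llama_sample_apply_guidance before a token is picked.
        var token = guidedSampler.Sample(ctx, guided.Sample(), Array.Empty<LLamaToken>());

        // Continue BOTH conversations with the chosen token so that, apart from
        // their different initial prompts, they stay in sync.
        guided.Prompt(token);
        guidance.Prompt(token);
        return token;
    }
}
```

Keeping the two sequences in lockstep after the prompts is what lets the guidance conversation keep producing logits that are comparable to the guided one at every step.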