Commit 5941514

Merge commit '5bf2a2771886ee86137e01dbc7492f78fb392066' into concedo_experimental
# Conflicts:
#    .devops/tools.sh
#    README.md
2 parents (8f4ed0d + 5bf2a27), commit 5941514

15 files changed: +566 additions, -435 deletions

examples/common.cpp

Lines changed: 30 additions & 3 deletions
@@ -236,6 +236,24 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
                 break;
             }
             params.mirostat_tau = std::stof(argv[i]);
+        } else if (arg == "--cfg-negative-prompt") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            params.cfg_negative_prompt = argv[i];
+        } else if (arg == "--cfg-scale") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            params.cfg_scale = std::stof(argv[i]);
+        } else if (arg == "--cfg-smooth-factor") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            params.cfg_smooth_factor = std::stof(argv[i]);
         } else if (arg == "-b" || arg == "--batch-size") {
             if (++i >= argc) {
                 invalid_param = true;
@@ -267,7 +285,6 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
                 break;
             }
             params.lora_adapter = argv[i];
-            params.use_mmap = false;
         } else if (arg == "--lora-base") {
             if (++i >= argc) {
                 invalid_param = true;
@@ -470,6 +487,10 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
     fprintf(stderr, " modifies the likelihood of token appearing in the completion,\n");
     fprintf(stderr, " i.e. `--logit-bias 15043+1` to increase likelihood of token ' Hello',\n");
     fprintf(stderr, " or `--logit-bias 15043-1` to decrease likelihood of token ' Hello'\n");
+    fprintf(stderr, " --cfg-negative-prompt PROMPT \n");
+    fprintf(stderr, " negative prompt to use for guidance. (default: empty)\n");
+    fprintf(stderr, " --cfg-scale N strength of guidance (default: %f, 1.0 = disable)\n", params.cfg_scale);
+    fprintf(stderr, " --cfg-smooth-factor N smooth factor between old and new logits (default: %f, 1.0 = no smoothing)\n", params.cfg_smooth_factor);
     fprintf(stderr, " -c N, --ctx-size N size of the prompt context (default: %d)\n", params.n_ctx);
     fprintf(stderr, " --ignore-eos ignore end of stream token and continue generating (implies --logit-bias 2-inf)\n");
     fprintf(stderr, " --no-penalize-nl do not penalize newline token\n");
@@ -499,7 +520,7 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
     fprintf(stderr, " --mtest compute maximum memory usage\n");
     fprintf(stderr, " --export export the computation graph to 'llama.ggml'\n");
     fprintf(stderr, " --verbose-prompt print prompt before generation\n");
-    fprintf(stderr, " --lora FNAME apply LoRA adapter (implies --no-mmap)\n");
+    fprintf(stderr, " --lora FNAME apply LoRA adapter\n");
     fprintf(stderr, " --lora-base FNAME optional model to use as a base for the layers modified by the LoRA adapter\n");
     fprintf(stderr, " -m FNAME, --model FNAME\n");
     fprintf(stderr, " model path (default: %s)\n", params.model.c_str());
@@ -536,7 +557,7 @@ std::vector<llama_token> llama_tokenize(struct llama_context * ctx, const std::s
     return res;
 }

-std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_params(const gpt_params & params) {
+struct llama_context_params llama_context_params_from_gpt_params(const gpt_params & params) {
     auto lparams = llama_context_default_params();

     lparams.n_ctx = params.n_ctx;
@@ -552,6 +573,12 @@ std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_par
     lparams.logits_all = params.perplexity;
     lparams.embedding = params.embedding;

+    return lparams;
+}
+
+std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_params(const gpt_params & params) {
+    auto lparams = llama_context_params_from_gpt_params(params);
+
     llama_model * model = llama_load_model_from_file(params.model.c_str(), lparams);
     if (model == NULL) {
         fprintf(stderr, "%s: error: failed to load model '%s'\n", __func__, params.model.c_str());

examples/common.h

Lines changed: 7 additions & 0 deletions
@@ -48,6 +48,12 @@ struct gpt_params {
     float mirostat_tau = 5.00f; // target entropy
     float mirostat_eta = 0.10f; // learning rate

+    // Classifier-Free Guidance
+    // https://arxiv.org/abs/2306.17806
+    std::string cfg_negative_prompt; // string to help guidance
+    float cfg_scale = 1.f; // How strong is guidance
+    float cfg_smooth_factor = 1.f; // Smooth factor between old and new logits
+
     std::string model = "models/7B/ggml-model.bin"; // model path
     std::string model_alias = "unknown"; // model alias
     std::string prompt = "";
@@ -99,6 +105,7 @@ std::vector<llama_token> llama_tokenize(struct llama_context * ctx, const std::s
 //

 std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_params(const gpt_params & params);
+struct llama_context_params llama_context_params_from_gpt_params(const gpt_params & params);

 //
 // Console utils
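
For readers unfamiliar with the new fields: cfg_scale and cfg_smooth_factor control how logits from the main (positive-prompt) context and the guidance (negative-prompt) context are combined. Below is a minimal sketch of that combination, following the formulation in the linked paper (arXiv:2306.17806) and the defaults documented in the help text; the authoritative mixing lives in llama_sample_classifier_free_guidance() inside llama.cpp and may differ in detail (e.g. log-softmax normalization), so treat this as illustrative only.

// Illustrative only: shows the role of cfg_scale and cfg_smooth_factor.
// cfg_scale = 1.0 disables guidance and cfg_smooth_factor = 1.0 means no
// smoothing, matching the help text added in examples/common.cpp.
#include <cstddef>
#include <vector>

static std::vector<float> cfg_mix(const std::vector<float> & logits,          // main context (positive prompt)
                                  const std::vector<float> & guidance_logits, // guidance context (negative prompt)
                                  float cfg_scale, float cfg_smooth_factor) {
    std::vector<float> out(logits.size());
    for (size_t i = 0; i < logits.size(); ++i) {
        // push the distribution away from the negative prompt; scale == 1 is a no-op
        const float guided = guidance_logits[i] + cfg_scale * (logits[i] - guidance_logits[i]);
        // blend guided and original logits; smooth_factor == 1 keeps only the guided values
        out[i] = cfg_smooth_factor * guided + (1.0f - cfg_smooth_factor) * logits[i];
    }
    return out;
}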

examples/main/README.md

Lines changed: 1 addition & 1 deletion
@@ -293,5 +293,5 @@ These options provide extra functionality and customization when running the LLa
 - `-mg i, --main-gpu i`: When using multiple GPUs this option controls which GPU is used for small tensors for which the overhead of splitting the computation across all GPUs is not worthwhile. The GPU in question will use slightly more VRAM to store a scratch buffer for temporary results. By default GPU 0 is used. Requires cuBLAS.
 - `-ts SPLIT, --tensor-split SPLIT`: When using multiple GPUs this option controls how large tensors should be split across all GPUs. `SPLIT` is a comma-separated list of non-negative values that assigns the proportion of data that each GPU should get in order. For example, "3,2" will assign 60% of the data to GPU 0 and 40% to GPU 1. By default the data is split in proportion to VRAM but this may not be optimal for performance. Requires cuBLAS.
 - `-lv, --low-vram`: Do not allocate a VRAM scratch buffer for holding temporary results. Reduces VRAM usage at the cost of performance, particularly prompt processing speed. Requires cuBLAS.
-- `--lora FNAME`: Apply a LoRA (Low-Rank Adaptation) adapter to the model (implies --no-mmap). This allows you to adapt the pretrained model to specific tasks or domains.
+- `--lora FNAME`: Apply a LoRA (Low-Rank Adaptation) adapter to the model. This allows you to adapt the pretrained model to specific tasks or domains.
 - `--lora-base FNAME`: Optional model to use as a base for the layers modified by the LoRA adapter. This flag is used in conjunction with the `--lora` flag, and specifies the base model for the adaptation.

examples/main/main.cpp

Lines changed: 84 additions & 4 deletions
@@ -109,10 +109,16 @@ int main(int argc, char ** argv) {

     llama_model * model;
     llama_context * ctx;
+    llama_context * ctx_guidance = NULL;
     g_ctx = &ctx;

     // load the model and apply lora adapter, if any
     std::tie(model, ctx) = llama_init_from_gpt_params(params);
+    if (params.cfg_scale > 1.f) {
+        struct llama_context_params lparams = llama_context_params_from_gpt_params(params);
+        ctx_guidance = llama_new_context_with_model(model, lparams);
+    }
+
     if (model == NULL) {
         fprintf(stderr, "%s: error: unable to load model\n", __func__);
         return 1;
@@ -183,15 +189,28 @@ int main(int argc, char ** argv) {
     // tokenize the prompt
     std::vector<llama_token> embd_inp;

-    if (params.interactive_first || params.instruct || !params.prompt.empty() || session_tokens.empty()) {
-        // Add a space in front of the first character to match OG llama tokenizer behavior
-        params.prompt.insert(0, 1, ' ');
+    // Add a space in front of the first character to match OG llama tokenizer behavior
+    params.prompt.insert(0, 1, ' ');

+    if (params.interactive_first || params.instruct || !params.prompt.empty() || session_tokens.empty()) {
         embd_inp = ::llama_tokenize(ctx, params.prompt, true);
     } else {
         embd_inp = session_tokens;
     }

+    // Tokenize negative prompt
+    std::vector<llama_token> guidance_inp;
+    int guidance_offset = 0;
+    int original_prompt_len = 0;
+    if (ctx_guidance) {
+        params.cfg_negative_prompt.insert(0, 1, ' ');
+        guidance_inp = ::llama_tokenize(ctx_guidance, params.cfg_negative_prompt, true);
+
+        std::vector<llama_token> original_inp = ::llama_tokenize(ctx, params.prompt, true);
+        original_prompt_len = original_inp.size();
+        guidance_offset = (int)guidance_inp.size() - original_prompt_len;
+    }
+
     const int n_ctx = llama_n_ctx(ctx);

     if ((int) embd_inp.size() > n_ctx - 4) {
@@ -258,6 +277,16 @@ int main(int argc, char ** argv) {
         for (int i = 0; i < (int) embd_inp.size(); i++) {
             fprintf(stderr, "%6d -> '%s'\n", embd_inp[i], llama_token_to_str(ctx, embd_inp[i]));
         }
+
+        if (ctx_guidance) {
+            fprintf(stderr, "\n");
+            fprintf(stderr, "%s: negative prompt: '%s'\n", __func__, params.cfg_negative_prompt.c_str());
+            fprintf(stderr, "%s: number of tokens in negative prompt = %zu\n", __func__, guidance_inp.size());
+            for (int i = 0; i < (int) guidance_inp.size(); i++) {
+                fprintf(stderr, "%6d -> '%s'\n", guidance_inp[i], llama_token_to_str(ctx, guidance_inp[i]));
+            }
+        }
+
         if (params.n_keep > 0) {
             fprintf(stderr, "%s: static prompt based on n_keep: '", __func__);
             for (int i = 0; i < params.n_keep; i++) {
@@ -334,11 +363,13 @@ int main(int argc, char ** argv) {
     int n_remain = params.n_predict;
     int n_consumed = 0;
     int n_session_consumed = 0;
+    int n_past_guidance = 0;

     // the first thing we will do is to output the prompt, so set color accordingly
     console_set_color(con_st, CONSOLE_COLOR_PROMPT);

     std::vector<llama_token> embd;
+    std::vector<llama_token> embd_guidance;

     // do one empty run to warm up the model
     {
@@ -367,11 +398,12 @@ int main(int argc, char ** argv) {
             // if we run out of context:
             // - take the n_keep first tokens from the original prompt (via n_past)
             // - take half of the last (n_ctx - n_keep) tokens and recompute the logits in batches
-            if (n_past + (int) embd.size() > n_ctx) {
+            if (n_past + (int) embd.size() + std::max<int>(0, guidance_offset) > n_ctx) {
                 const int n_left = n_past - params.n_keep;

                 // always keep the first token - BOS
                 n_past = std::max(1, params.n_keep);
+                n_past_guidance = std::max(1, params.n_keep + guidance_offset);

                 // insert n_left/2 tokens at the start of embd from last_n_tokens
                 embd.insert(embd.begin(), last_n_tokens.begin() + n_ctx - n_left/2 - embd.size(), last_n_tokens.end() - embd.size());
@@ -412,6 +444,48 @@ int main(int argc, char ** argv) {

            // evaluate tokens in batches
            // embd is typically prepared beforehand to fit within a batch, but not always
+
+            if (ctx_guidance) {
+                int input_size = 0;
+                llama_token* input_buf = NULL;
+
+                if (n_past_guidance < (int) guidance_inp.size()) {
+                    // Guidance context should have the same data with these modifications:
+                    //
+                    // * Replace the initial prompt
+                    // * Shift everything by guidance_offset
+                    embd_guidance = guidance_inp;
+                    if (embd.begin() + original_prompt_len < embd.end()) {
+                        embd_guidance.insert(
+                            embd_guidance.end(),
+                            embd.begin() + original_prompt_len,
+                            embd.end()
+                        );
+                    }
+
+                    input_buf = embd_guidance.data();
+                    input_size = embd_guidance.size();
+                    //fprintf(stderr, "\n---------------------\n");
+                    //for (int i = 0; i < (int) embd_guidance.size(); i++) {
+                        //fprintf(stderr, "%s", llama_token_to_str(ctx, embd_guidance[i]));
+                    //}
+                    //fprintf(stderr, "\n---------------------\n");
+                } else {
+                    input_buf = embd.data();
+                    input_size = embd.size();
+                }
+
+                for (int i = 0; i < input_size; i += params.n_batch) {
+                    int n_eval = std::min(input_size - i, params.n_batch);
+                    if (llama_eval(ctx_guidance, input_buf + i, n_eval, n_past_guidance, params.n_threads)) {
+                        fprintf(stderr, "%s : failed to eval\n", __func__);
+                        return 1;
+                    }
+
+                    n_past_guidance += n_eval;
+                }
+            }
+
             for (int i = 0; i < (int) embd.size(); i += params.n_batch) {
                 int n_eval = (int) embd.size() - i;
                 if (n_eval > params.n_batch) {
@@ -431,6 +505,7 @@ int main(int argc, char ** argv) {
        }

        embd.clear();
+        embd_guidance.clear();

        if ((int) embd_inp.size() <= n_consumed && !is_interacting) {
            // out of user input, sample next token
@@ -473,6 +548,10 @@ int main(int argc, char ** argv) {

            llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false };

+            if (ctx_guidance) {
+                llama_sample_classifier_free_guidance(ctx, &candidates_p, ctx_guidance, params.cfg_scale, params.cfg_smooth_factor);
+            }
+
            // Apply penalties
            float nl_logit = logits[llama_token_nl()];
            auto last_n_repeat = std::min(std::min((int)last_n_tokens.size(), repeat_last_n), n_ctx);
@@ -668,6 +747,7 @@ int main(int argc, char ** argv) {
    }

    llama_print_timings(ctx);
+    if (ctx_guidance) { llama_free(ctx_guidance); }
    llama_free(ctx);
    llama_free_model(model);

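
The trickiest hunk above is the one that assembles the input for the guidance context: the negative prompt stands in for the original prompt, and everything generated after the prompt is appended unchanged, so both contexts see the same continuation shifted by guidance_offset. A standalone sketch of that assembly step follows; build_guidance_input is a hypothetical helper, not part of the commit.

// Hypothetical helper condensing the embd_guidance construction shown above.
#include <vector>

#include "llama.h"

static std::vector<llama_token> build_guidance_input(
        const std::vector<llama_token> & guidance_inp,       // tokenized negative prompt
        const std::vector<llama_token> & embd,               // tokens queued for the main context
        int                              original_prompt_len // token count of the positive prompt
) {
    // start from the negative prompt instead of the positive one
    std::vector<llama_token> out = guidance_inp;
    // append anything past the positive prompt (i.e. generated tokens) unchanged
    if ((int) embd.size() > original_prompt_len) {
        out.insert(out.end(), embd.begin() + original_prompt_len, embd.end());
    }
    return out;
}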

examples/server/README.md

Lines changed: 1 addition & 1 deletion
@@ -16,7 +16,7 @@ Command line options:
 - `--memory-f32`: Use 32-bit floats instead of 16-bit floats for memory key+value. Not recommended.
 - `--mlock`: Lock the model in memory, preventing it from being swapped out when memory-mapped.
 - `--no-mmap`: Do not memory-map the model. By default, models are mapped into memory, which allows the system to load only the necessary parts of the model as needed.
-- `--lora FNAME`: Apply a LoRA (Low-Rank Adaptation) adapter to the model (implies --no-mmap). This allows you to adapt the pretrained model to specific tasks or domains.
+- `--lora FNAME`: Apply a LoRA (Low-Rank Adaptation) adapter to the model. This allows you to adapt the pretrained model to specific tasks or domains.
 - `--lora-base FNAME`: Optional model to use as a base for the layers modified by the LoRA adapter. This flag is used in conjunction with the `--lora` flag, and specifies the base model for the adaptation.
 - `-to N`, `--timeout N`: Server read/write timeout in seconds. Default `600`.
 - `--host`: Set the hostname or ip address to listen. Default `127.0.0.1`.

examples/server/server.cpp

Lines changed: 1 addition & 2 deletions
@@ -632,7 +632,7 @@ static void server_print_usage(const char *argv0, const gpt_params &params,
    fprintf(stderr, " model path (default: %s)\n", params.model.c_str());
    fprintf(stderr, " -a ALIAS, --alias ALIAS\n");
    fprintf(stderr, " set an alias for the model, will be added as `model` field in completion response\n");
-   fprintf(stderr, " --lora FNAME apply LoRA adapter (implies --no-mmap)\n");
+   fprintf(stderr, " --lora FNAME apply LoRA adapter\n");
    fprintf(stderr, " --lora-base FNAME optional model to use as a base for the layers modified by the LoRA adapter\n");
    fprintf(stderr, " --host ip address to listen (default (default: %s)\n", sparams.hostname.c_str());
    fprintf(stderr, " --port PORT port to listen (default (default: %d)\n", sparams.port);
@@ -820,7 +820,6 @@ static void server_params_parse(int argc, char **argv, server_params &sparams,
                break;
            }
            params.lora_adapter = argv[i];
-           params.use_mmap = false;
        }
        else if (arg == "--lora-base")
        {

examples/train-text-from-scratch/train-text-from-scratch.cpp

Lines changed: 3 additions & 11 deletions
@@ -1354,17 +1354,9 @@ struct ggml_tensor * expand(struct ggml_cgraph * g, struct ggml_tensor * t) {
         }
     }

-    if (t->src0) {
-        expand(g, t->src0);
-    }
-
-    if (t->src1) {
-        expand(g, t->src1);
-    }
-
-    for (int i = 0; i < GGML_MAX_OPT; ++i) {
-        if (t->opt[i]) {
-            expand(g, t->opt[i]);
+    for (int i = 0; i < GGML_MAX_SRC; ++i) {
+        if (t->src[i]) {
+            expand(g, t->src[i]);
         }
     }

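
This hunk tracks a ggml refactor: the separate src0/src1/opt[] parent fields of ggml_tensor were merged into a single src[] array of GGML_MAX_SRC entries, so graph walkers need only one loop. A minimal hypothetical visitor over that array (the real expand() above also handles graph bookkeeping), assuming the ggml_tensor layout used by this commit:

// Illustrative traversal over a tensor's parents after the src[] unification.
#include <cstddef>

#include "ggml.h"

static void visit_parents(struct ggml_tensor * t,
                          void (*visit)(struct ggml_tensor * parent)) {
    for (int i = 0; i < GGML_MAX_SRC; ++i) {
        if (t->src[i] != NULL) {
            visit(t->src[i]); // each populated slot is a parent in the compute graph
        }
    }
}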
