From 5bbf90d0bc86c68bc4c6139d430e5a20fa3fd5d4 Mon Sep 17 00:00:00 2001
From: Andy
Date: Sat, 23 Aug 2025 16:26:25 +0800
Subject: [PATCH 1/4] Add split model loading support
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This commit introduces comprehensive support for loading models from multiple
split files:

- Added `load_from_splits()` method to LlamaModel for loading models split
  across multiple files
- Added utility functions `split_path()` and `split_prefix()` for working with
  split file naming conventions
- Added split_model example demonstrating usage of the split loading
  functionality
- Updated workspace Cargo.toml to include the new split_model example

This feature enables loading very large models that have been split due to
filesystem limitations or distribution requirements.

🤖 Generated with [Claude Code](https://claude.ai/code)

Co-Authored-By: Claude
---
 Cargo.toml                       |   2 +
 examples/split_model/Cargo.toml  |   9 ++
 examples/split_model/src/main.rs | 195 +++++++++++++++++++++++++++++++
 llama-cpp-2/src/model.rs         | 168 ++++++++++++++++++++++++++
 4 files changed, 374 insertions(+)
 create mode 100644 examples/split_model/Cargo.toml
 create mode 100644 examples/split_model/src/main.rs

diff --git a/Cargo.toml b/Cargo.toml
index 8a5835fc..011bc34e 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -7,6 +7,8 @@ members = [
     "examples/simple",
     "examples/reranker",
     "examples/mtmd",
+    "examples/split_model",
+    "examples/rpc",
 ]
 
 [workspace.dependencies]
diff --git a/examples/split_model/Cargo.toml b/examples/split_model/Cargo.toml
new file mode 100644
index 00000000..e366545a
--- /dev/null
+++ b/examples/split_model/Cargo.toml
@@ -0,0 +1,9 @@
+[package]
+name = "split_model"
+version = "0.1.0"
+edition = "2021"
+
+[dependencies]
+llama-cpp-2 = { path = "../../llama-cpp-2" }
+anyhow = "1.0"
+clap = { version = "4", features = ["derive"] }
\ No newline at end of file
diff --git a/examples/split_model/src/main.rs b/examples/split_model/src/main.rs
new file mode 100644
index 00000000..92f141b7
--- /dev/null
+++ b/examples/split_model/src/main.rs
@@ -0,0 +1,195 @@
+//! Example demonstrating how to load split GGUF models.
+//!
+//! This example shows how to:
+//! - Load a model split across multiple files
+//! - Use utility functions to work with split file naming conventions
+//! - Generate text from a split model
+
+use anyhow::Result;
+use clap::Parser;
+use llama_cpp_2::{
+    context::params::LlamaContextParams,
+    llama_backend::LlamaBackend,
+    llama_batch::LlamaBatch,
+    model::{params::LlamaModelParams, AddBos, LlamaModel},
+    sampling::LlamaSampler,
+};
+use std::io::{self, Write};
+use std::num::NonZeroU32;
+use std::path::{Path, PathBuf};
+
+/// Command line arguments for the split model example
+#[derive(Parser, Debug)]
+#[command(author, version, about, long_about = None)]
+struct Args {
+    /// Paths to the split model files (can be specified multiple times)
+    #[arg(short = 'm', long = "model", required = true, num_args = 1..)]
+    model_paths: Vec<PathBuf>,
+
+    /// Alternatively, provide a prefix and the program will auto-detect splits
+    #[arg(short = 'p', long = "prefix", conflicts_with = "model_paths")]
+    prefix: Option<String>,
+
+    /// Number of splits (required if using --prefix)
+    #[arg(short = 'n', long = "num-splits", requires = "prefix")]
+    num_splits: Option<u32>,
+
+    /// Prompt to use for generation
+    #[arg(short = 't', long = "prompt", default_value = "Once upon a time")]
+    prompt: String,
+
+    /// Number of tokens to generate
+    #[arg(short = 'g', long = "n-predict", default_value_t = 128)]
+    n_predict: i32,
+
+    /// Number of GPU layers
+    #[arg(short = 'l', long = "n-gpu-layers", default_value_t = 0)]
+    n_gpu_layers: u32,
+
+    /// Context size
+    #[arg(short = 'c', long = "ctx-size", default_value_t = 2048)]
+    ctx_size: u32,
+
+    /// Temperature for sampling
+    #[arg(long = "temp", default_value_t = 0.8)]
+    temperature: f32,
+
+    /// Top-P for sampling
+    #[arg(long = "top-p", default_value_t = 0.95)]
+    top_p: f32,
+
+    /// Seed for random number generation
+    #[arg(long = "seed", default_value_t = 1234)]
+    seed: u32,
+}
+
+fn main() -> Result<()> {
+    let args = Args::parse();
+
+    // Determine the model paths
+    let model_paths = if let Some(prefix) = args.prefix {
+        let num_splits = args.num_splits.expect("num-splits required with prefix");
+
+        // Generate split paths using the utility function
+        let mut paths = Vec::new();
+        for i in 1..=num_splits {
+            let path = LlamaModel::split_path(&prefix, i as i32, num_splits as i32);
+            paths.push(PathBuf::from(path));
+        }
+
+        println!("Generated split paths:");
+        for path in &paths {
+            println!("  - {}", path.display());
+        }
+
+        paths
+    } else {
+        args.model_paths
+    };
+
+    // Verify all split files exist
+    for path in &model_paths {
+        if !path.exists() {
+            eprintln!("Error: Split file not found: {}", path.display());
+            std::process::exit(1);
+        }
+    }
+
+    println!("Loading model from {} splits...", model_paths.len());
+
+    // Initialize the backend
+    let backend = LlamaBackend::init()?;
+
+    // Set up model parameters
+    let mut model_params = LlamaModelParams::default();
+    if args.n_gpu_layers > 0 {
+        model_params = model_params.with_n_gpu_layers(args.n_gpu_layers);
+    }
+
+    // Load the model from splits
+    let model = LlamaModel::load_from_splits(&backend, &model_paths, &model_params)?;
+    println!("Model loaded successfully!");
+
+    // Get model info
+    let n_vocab = model.n_vocab();
+    println!("Model vocabulary size: {}", n_vocab);
+
+    // Create context
+    let ctx_params = LlamaContextParams::default()
+        .with_n_ctx(Some(NonZeroU32::new(args.ctx_size).unwrap()));
+
+    let mut ctx = model.new_context(&backend, ctx_params)?;
+    println!("Context created with size: {}", args.ctx_size);
+
+    // Tokenize the prompt
+    let tokens = model.str_to_token(&args.prompt, AddBos::Always)?;
+    println!("Prompt tokenized into {} tokens", tokens.len());
+
+    // Create batch
+    let mut batch = LlamaBatch::new(512, 1);
+
+    // Add tokens to batch
+    let last_index = tokens.len() - 1;
+    for (i, token) in tokens.iter().enumerate() {
+        let is_last = i == last_index;
+        batch.add(*token, i as i32, &[0], is_last)?;
+    }
+
+    // Decode the batch
+    ctx.decode(&mut batch)?;
+    println!("Initial prompt processed");
+
+    // Set up sampling
+    let mut sampler = LlamaSampler::chain_simple([
+        LlamaSampler::temp(args.temperature),
+        LlamaSampler::top_p(args.top_p, 1),
+    ]);
+
+    // Generate text
+    print!("{}", args.prompt);
+    io::stdout().flush()?;
+
+    let mut n_cur = batch.n_tokens();
+    let mut n_decode = 0;
+
+    while n_decode < args.n_predict {
+        // Sample the next token
+        let new_token = sampler.sample(&ctx, batch.n_tokens() - 1);
+        sampler.accept(new_token);
+
+        // Check for EOS
+        if model.is_eog_token(new_token) {
+            println!();
+            break;
+        }
+
+        // Print the token
+        let piece = model.token_to_str(new_token, llama_cpp_2::model::Special::Tokenize)?;
+        print!("{}", piece);
+        io::stdout().flush()?;
+
+        // Prepare the next batch
+        batch.clear();
+        batch.add(new_token, n_cur, &[0], true)?;
+        n_cur += 1;
+
+        // Decode
+        ctx.decode(&mut batch)?;
+        n_decode += 1;
+    }
+
+    println!("\n\nGeneration complete!");
+    println!("Generated {} tokens", n_decode);
+
+    // Demonstrate the split_prefix utility
+    if let Some(first_path) = model_paths.first() {
+        if let Some(path_str) = first_path.to_str() {
+            // Try to extract the prefix from the first split file
+            if let Some(prefix) = LlamaModel::split_prefix(path_str, 1, model_paths.len() as i32) {
+                println!("\nExtracted prefix from first split: {}", prefix);
+            }
+        }
+    }
+
+    Ok(())
+}
\ No newline at end of file
diff --git a/llama-cpp-2/src/model.rs b/llama-cpp-2/src/model.rs
index d2df9990..3490b466 100644
--- a/llama-cpp-2/src/model.rs
+++ b/llama-cpp-2/src/model.rs
@@ -622,6 +622,174 @@ impl LlamaModel {
         Ok(LlamaModel { model })
     }
 
+    /// Load a model from multiple split files.
+    ///
+    /// This function loads a model that has been split across multiple files. This is useful for
+    /// very large models that exceed filesystem limitations or need to be distributed across
+    /// multiple storage devices.
+    ///
+    /// # Arguments
+    ///
+    /// * `paths` - A slice of paths to the split model files
+    /// * `params` - The model parameters
+    ///
+    /// # Errors
+    ///
+    /// Returns an error if:
+    /// - Any of the paths cannot be converted to a C string
+    /// - The model fails to load from the splits
+    /// - Any path doesn't exist or isn't accessible
+    ///
+    /// # Example
+    ///
+    /// ```no_run
+    /// use llama_cpp_2::model::{LlamaModel, params::LlamaModelParams};
+    /// use llama_cpp_2::llama_backend::LlamaBackend;
+    /// use std::path::Path;
+    ///
+    /// # fn main() -> Result<(), Box<dyn std::error::Error>> {
+    /// let backend = LlamaBackend::init()?;
+    /// let params = LlamaModelParams::default();
+    ///
+    /// let paths = vec![
+    ///     Path::new("model-00001-of-00003.gguf"),
+    ///     Path::new("model-00002-of-00003.gguf"),
+    ///     Path::new("model-00003-of-00003.gguf"),
+    /// ];
+    ///
+    /// let model = LlamaModel::load_from_splits(&backend, &paths, &params)?;
+    /// # Ok(())
+    /// # }
+    /// ```
+    #[tracing::instrument(skip_all)]
+    pub fn load_from_splits(
+        _: &LlamaBackend,
+        paths: &[impl AsRef<Path>],
+        params: &LlamaModelParams,
+    ) -> Result<Self, LlamaModelLoadError> {
+        // Convert paths to C strings
+        let c_strings: Vec<CString> = paths
+            .iter()
+            .map(|p| {
+                let path = p.as_ref();
+                debug_assert!(path.exists(), "{path:?} does not exist");
+                let path_str = path
+                    .to_str()
+                    .ok_or(LlamaModelLoadError::PathToStrError(path.to_path_buf()))?;
+                CString::new(path_str).map_err(LlamaModelLoadError::from)
+            })
+            .collect::<Result<Vec<_>, _>>()?;
+
+        // Create array of pointers to C strings
+        let c_ptrs: Vec<*const c_char> = c_strings.iter().map(|s| s.as_ptr()).collect();
+
+        // Load the model from splits
+        let llama_model = unsafe {
+            llama_cpp_sys_2::llama_model_load_from_splits(
+                c_ptrs.as_ptr() as *mut *const c_char,
+                c_ptrs.len(),
+                params.params,
+            )
+        };
+
+        let model = NonNull::new(llama_model).ok_or(LlamaModelLoadError::NullResult)?;
+
+        tracing::debug!("Loaded model from {} splits", paths.len());
+        Ok(LlamaModel { model })
+    }
+
+    /// Build a split GGUF file path for a specific chunk.
+    ///
+    /// This utility function creates the standardized filename for a split model chunk
+    /// following the pattern: `{prefix}-{split_no:05d}-of-{split_count:05d}.gguf`
+    ///
+    /// # Arguments
+    ///
+    /// * `path_prefix` - The base path and filename prefix
+    /// * `split_no` - The split number (1-indexed)
+    /// * `split_count` - The total number of splits
+    ///
+    /// # Returns
+    ///
+    /// Returns the formatted split path as a String
+    ///
+    /// # Example
+    ///
+    /// ```
+    /// use llama_cpp_2::model::LlamaModel;
+    ///
+    /// let path = LlamaModel::split_path("/models/llama", 2, 4);
+    /// assert_eq!(path, "/models/llama-00002-of-00004.gguf");
+    /// ```
+    pub fn split_path(path_prefix: &str, split_no: i32, split_count: i32) -> String {
+        let mut buffer = vec![0u8; 1024];
+        let path_prefix_cstr = CString::new(path_prefix).unwrap_or_else(|_| CString::new("").unwrap());
+        let len = unsafe {
+            llama_cpp_sys_2::llama_split_path(
+                buffer.as_mut_ptr() as *mut c_char,
+                buffer.len(),
+                path_prefix_cstr.as_ptr(),
+                split_no,
+                split_count,
+            )
+        };
+
+        if len > 0 && len < buffer.len() as i32 {
+            buffer.truncate(len as usize);
+            String::from_utf8(buffer).unwrap_or_else(|_| String::new())
+        } else {
+            String::new()
+        }
+    }
+
+    /// Extract the path prefix from a split filename.
+    ///
+    /// This function extracts the base path prefix from a split model filename,
+    /// but only if the split_no and split_count match the pattern in the filename.
+    ///
+    /// # Arguments
+    ///
+    /// * `split_path` - The full path to a split file
+    /// * `split_no` - The expected split number
+    /// * `split_count` - The expected total number of splits
+    ///
+    /// # Returns
+    ///
+    /// Returns `Some(prefix)` if the split pattern matches, `None` otherwise
+    ///
+    /// # Example
+    ///
+    /// ```
+    /// use llama_cpp_2::model::LlamaModel;
+    ///
+    /// let prefix = LlamaModel::split_prefix("/models/llama-00002-of-00004.gguf", 2, 4);
+    /// assert_eq!(prefix, Some("/models/llama".to_string()));
+    ///
+    /// // Returns None if the pattern doesn't match
+    /// let prefix = LlamaModel::split_prefix("/models/llama-00002-of-00004.gguf", 3, 4);
+    /// assert_eq!(prefix, None);
+    /// ```
+    pub fn split_prefix(split_path: &str, split_no: i32, split_count: i32) -> Option<String> {
+        let mut buffer = vec![0u8; 1024];
+        let split_path_cstr = CString::new(split_path).ok()?;
+        let len = unsafe {
+            llama_cpp_sys_2::llama_split_prefix(
+                buffer.as_mut_ptr() as *mut c_char,
+                buffer.len(),
+                split_path_cstr.as_ptr(),
+                split_no,
+                split_count,
+            )
+        };
+
+        if len > 0 && len < buffer.len() as i32 {
+            buffer.truncate(len as usize);
+            String::from_utf8(buffer).ok()
+        } else {
+            None
+        }
+    }
+
     /// Initializes a lora adapter from a file.
     ///
     /// # Errors
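The two helpers above delegate to llama.cpp's `llama_split_path` / `llama_split_prefix`, which encode the `{prefix}-{split_no:05d}-of-{split_count:05d}.gguf` naming convention. A minimal sketch of that convention in plain Rust follows; no FFI is involved, and the `expected_split_name` helper is illustrative only and not part of this patch (real code should call `LlamaModel::split_path` / `split_prefix`):

```rust
// Illustration only: mirrors the documented split-file naming convention
// without calling into llama.cpp.
fn expected_split_name(prefix: &str, split_no: i32, split_count: i32) -> String {
    format!("{prefix}-{split_no:05}-of-{split_count:05}.gguf")
}

fn main() {
    // File names a 3-way split of "/models/llama" is expected to use.
    let names: Vec<String> = (1..=3)
        .map(|i| expected_split_name("/models/llama", i, 3))
        .collect();
    assert_eq!(
        names,
        [
            "/models/llama-00001-of-00003.gguf",
            "/models/llama-00002-of-00003.gguf",
            "/models/llama-00003-of-00003.gguf",
        ]
    );
}
```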
From b858f12d446cfe59f2da569877c64b9d6d28f12b Mon Sep 17 00:00:00 2001
From: Andy
Date: Sat, 23 Aug 2025 16:58:29 +0800
Subject: [PATCH 2/4] fix: Remove unused import and clean up workspace members
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- Remove unused Path import from split_model example
- Remove RPC example from workspace members on split-model-loading branch

🤖 Generated with [Claude Code](https://claude.ai/code)

Co-Authored-By: Claude
---
 Cargo.lock                       | 9 +++++++++
 Cargo.toml                       | 1 -
 examples/split_model/src/main.rs | 2 +-
 3 files changed, 10 insertions(+), 2 deletions(-)

diff --git a/Cargo.lock b/Cargo.lock
index 4decebc7..0c9794c0 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -1121,6 +1121,15 @@ version = "1.13.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "3c5e1a9a646d36c3599cd173a41282daf47c44583ad367b8e6837255952e5c67"
 
+[[package]]
+name = "split_model"
+version = "0.1.0"
+dependencies = [
+ "anyhow",
+ "clap",
+ "llama-cpp-2",
+]
+
 [[package]]
 name = "stable_deref_trait"
 version = "1.2.0"
diff --git a/Cargo.toml b/Cargo.toml
index 011bc34e..3d5feb51 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -8,7 +8,6 @@ members = [
     "examples/reranker",
     "examples/mtmd",
     "examples/split_model",
-    "examples/rpc",
 ]
 
 [workspace.dependencies]
diff --git a/examples/split_model/src/main.rs b/examples/split_model/src/main.rs
index 92f141b7..324c3a4a 100644
--- a/examples/split_model/src/main.rs
+++ b/examples/split_model/src/main.rs
@@ -16,7 +16,7 @@ use llama_cpp_2::{
 };
 use std::io::{self, Write};
 use std::num::NonZeroU32;
-use std::path::{Path, PathBuf};
+use std::path::PathBuf;
 
 /// Command line arguments for the split model example
 #[derive(Parser, Debug)]
From 8f3cb9b9b0ea0f8cdd0fd9399cfb4b4feac1cf93 Mon Sep 17 00:00:00 2001
From: Andy
Date: Sat, 23 Aug 2025 17:06:29 +0800
Subject: [PATCH 3/4] docs: Add missing RopeType variant documentation
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- Added documentation comments for RopeType enum variants
- Ensures all public APIs are properly documented

🤖 Generated with [Claude Code](https://claude.ai/code)

Co-Authored-By: Claude
---
 llama-cpp-2/src/model.rs | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/llama-cpp-2/src/model.rs b/llama-cpp-2/src/model.rs
index 3490b466..ee85d066 100644
--- a/llama-cpp-2/src/model.rs
+++ b/llama-cpp-2/src/model.rs
@@ -95,9 +95,13 @@ impl LlamaChatMessage {
 
 /// The Rope type that's used within the model.
 #[derive(Debug, Clone, Copy, PartialEq, Eq)]
 pub enum RopeType {
+    /// Normal RoPE (Rotary Position Embedding)
     Norm,
+    /// GPT-NeoX style RoPE
     NeoX,
+    /// Multi-resolution RoPE
     MRope,
+    /// Vision model RoPE
     Vision,
 }
From a0d8be5f02276af9a781e0e7efcbf315d3e19cd4 Mon Sep 17 00:00:00 2001
From: Andy
Date: Sat, 23 Aug 2025 18:10:44 +0800
Subject: [PATCH 4/4] feat(llama-cpp-sys-2): add dynamic tools CMakeLists.txt
 generation for split-model-loading
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This commit adds the scalable dynamic tools building system to the
split-model-loading branch:

- Adds generate_tools_cmake() function to dynamically create tools/CMakeLists.txt
- Only builds tools for enabled features (solving PR #806 issue)
- Split model loading doesn't require tools but maintains architecture consistency
- Includes tools/CMakeLists.txt in Cargo.toml for build system compatibility
- Uses feature-based conditional compilation for future extensibility

This creates a merge-friendly architecture where each feature branch can extend
tool building without conflicts.

🤖 Generated with [Claude Code](https://claude.ai/code)

Co-Authored-By: Claude
---
 llama-cpp-sys-2/Cargo.toml |  1 +
 llama-cpp-sys-2/build.rs   | 61 ++++++++++++++++++++++++++++++++++++--
 2 files changed, 60 insertions(+), 2 deletions(-)

diff --git a/llama-cpp-sys-2/Cargo.toml b/llama-cpp-sys-2/Cargo.toml
index 0a52aba9..96c4bfb8 100644
--- a/llama-cpp-sys-2/Cargo.toml
+++ b/llama-cpp-sys-2/Cargo.toml
@@ -51,6 +51,7 @@ include = [
     "/llama.cpp/ggml/CMakeLists.txt",
     "/llama.cpp/ggml/src/CMakeLists.txt",
     "/llama.cpp/src/CMakeLists.txt",
+    "/llama.cpp/tools/CMakeLists.txt",
 
     "/llama.cpp/cmake",
     "/llama.cpp/ggml/cmake",
diff --git a/llama-cpp-sys-2/build.rs b/llama-cpp-sys-2/build.rs
index 08fcd49b..2110ab33 100644
--- a/llama-cpp-sys-2/build.rs
+++ b/llama-cpp-sys-2/build.rs
@@ -1,6 +1,7 @@
 use cmake::Config;
 use glob::glob;
 use std::env;
+use std::fs;
 use std::path::{Path, PathBuf};
 use std::process::Command;
 use walkdir::DirEntry;
@@ -197,6 +198,52 @@ fn is_hidden(e: &DirEntry) -> bool {
         .unwrap_or_default()
 }
 
+/// Generate a dynamic tools CMakeLists.txt based on enabled features
+/// This approach allows each feature branch to add their own tool without conflicts
+fn generate_tools_cmake() {
+    let mut cmake_content = String::from(
+r#"# Auto-generated tools CMakeLists.txt based on enabled features
+# This file is created dynamically to only build tools for enabled features
+
+# dependencies
+find_package(Threads REQUIRED)
+
+# third-party
+# ...
+
+# flags
+llama_add_compile_flags()
+
+# tools - only build what's needed based on enabled features
+if (NOT EMSCRIPTEN)
+"#);
+
+    // Add tools based on enabled features
+    if cfg!(feature = "mtmd") {
+        cmake_content.push_str("    add_subdirectory(mtmd)\n");
+    }
+
+    // Future feature branches can add their tools here:
+    // if cfg!(feature = "rpc") {
+    //     cmake_content.push_str("    add_subdirectory(rpc)\n");
+    // }
+    // if cfg!(feature = "server") {
+    //     cmake_content.push_str("    add_subdirectory(server)\n");
+    // }
+    // if cfg!(feature = "quantize") {
+    //     cmake_content.push_str("    add_subdirectory(quantize)\n");
+    // }
+
+    // Split model loading doesn't need any tools - it's just a library feature
+
+    cmake_content.push_str("endif()\n");
+
+    // Write the generated CMakeLists.txt
+    let tools_cmake_path = Path::new("llama.cpp/tools/CMakeLists.txt");
+    fs::write(tools_cmake_path, cmake_content)
+        .expect("Failed to write generated tools CMakeLists.txt");
+}
+
 fn main() {
     println!("cargo:rerun-if-changed=build.rs");
 
@@ -448,10 +495,20 @@ fn main() {
     config.define("LLAMA_BUILD_TOOLS", "OFF");
     config.define("LLAMA_CURL", "OFF");
 
-    if cfg!(feature = "mtmd") {
+    // Generate dynamic tools CMakeLists.txt based on enabled tool features
+    let any_tool_features = cfg!(feature = "mtmd")
+        // Future tool features can be added here by other branches:
+        // || cfg!(feature = "rpc")
+        // || cfg!(feature = "server")
+        // || cfg!(feature = "quantize")
+        ;
+
+    if any_tool_features {
         config.define("LLAMA_BUILD_COMMON", "ON");
-        // mtmd support in llama-cpp is within the tools directory
         config.define("LLAMA_BUILD_TOOLS", "ON");
+
+        // Generate the tools CMakeLists.txt with only enabled features
+        generate_tools_cmake();
     }
 
     // Pass CMAKE_ environment variables down to CMake
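For context on what `generate_tools_cmake()` produces, the sketch below assembles the equivalent string for the case where only the `mtmd` tool feature is enabled. It is a simplified, stand-alone model of the build script's logic; the `tools_cmake_for` helper is an assumption for illustration, whereas the real build.rs keys off `cfg!(feature = ...)` at compile time and writes the result to `llama.cpp/tools/CMakeLists.txt`:

```rust
// Simplified stand-in for generate_tools_cmake(): returns the CMake text
// that would be generated when only the `mtmd` tool feature is enabled.
fn tools_cmake_for(mtmd_enabled: bool) -> String {
    let mut cmake = String::from(
        "# Auto-generated tools CMakeLists.txt based on enabled features\n\
         find_package(Threads REQUIRED)\n\
         llama_add_compile_flags()\n\
         if (NOT EMSCRIPTEN)\n",
    );
    if mtmd_enabled {
        cmake.push_str("    add_subdirectory(mtmd)\n");
    }
    cmake.push_str("endif()\n");
    cmake
}

fn main() {
    let generated = tools_cmake_for(true);
    assert!(generated.contains("add_subdirectory(mtmd)"));
    print!("{generated}");
}
```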