From 5bbf90d0bc86c68bc4c6139d430e5a20fa3fd5d4 Mon Sep 17 00:00:00 2001
From: Andy
Date: Sat, 23 Aug 2025 16:26:25 +0800
Subject: [PATCH 1/4] Add split model loading support
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This commit introduces comprehensive support for loading models from multiple
split files:

- Added `load_from_splits()` method to LlamaModel for loading models split
  across multiple files
- Added utility functions `split_path()` and `split_prefix()` for working with
  split file naming conventions
- Added split_model example demonstrating usage of the split loading
  functionality
- Updated workspace Cargo.toml to include the new split_model example

This feature enables loading very large models that have been split due to
filesystem limitations or distribution requirements.

🤖 Generated with [Claude Code](https://claude.ai/code)

Co-Authored-By: Claude
---
 Cargo.toml                       |   2 +
 examples/split_model/Cargo.toml  |   9 ++
 examples/split_model/src/main.rs | 195 +++++++++++++++++++++++++++++++
 llama-cpp-2/src/model.rs         | 168 ++++++++++++++++++++++++++
 4 files changed, 374 insertions(+)
 create mode 100644 examples/split_model/Cargo.toml
 create mode 100644 examples/split_model/src/main.rs

diff --git a/Cargo.toml b/Cargo.toml
index 8a5835fc..011bc34e 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -7,6 +7,8 @@ members = [
     "examples/simple",
     "examples/reranker",
     "examples/mtmd",
+    "examples/split_model",
+    "examples/rpc",
 ]
 
 [workspace.dependencies]
diff --git a/examples/split_model/Cargo.toml b/examples/split_model/Cargo.toml
new file mode 100644
index 00000000..e366545a
--- /dev/null
+++ b/examples/split_model/Cargo.toml
@@ -0,0 +1,9 @@
+[package]
+name = "split_model"
+version = "0.1.0"
+edition = "2021"
+
+[dependencies]
+llama-cpp-2 = { path = "../../llama-cpp-2" }
+anyhow = "1.0"
+clap = { version = "4", features = ["derive"] }
\ No newline at end of file
diff --git a/examples/split_model/src/main.rs b/examples/split_model/src/main.rs
new file mode 100644
index 00000000..92f141b7
--- /dev/null
+++ b/examples/split_model/src/main.rs
@@ -0,0 +1,195 @@
+//! Example demonstrating how to load split GGUF models.
+//!
+//! This example shows how to:
+//! - Load a model split across multiple files
+//! - Use utility functions to work with split file naming conventions
+//! - Generate text from a split model
+
+use anyhow::Result;
+use clap::Parser;
+use llama_cpp_2::{
+    context::params::LlamaContextParams,
+    llama_backend::LlamaBackend,
+    llama_batch::LlamaBatch,
+    model::{params::LlamaModelParams, AddBos, LlamaModel},
+    sampling::LlamaSampler,
+};
+use std::io::{self, Write};
+use std::num::NonZeroU32;
+use std::path::{Path, PathBuf};
+
+/// Command line arguments for the split model example
+#[derive(Parser, Debug)]
+#[command(author, version, about, long_about = None)]
+struct Args {
+    /// Paths to the split model files (can be specified multiple times)
+    #[arg(short = 'm', long = "model", required = true, num_args = 1..)]
+    model_paths: Vec<PathBuf>,
+
+    /// Alternatively, provide a prefix and the program will auto-detect splits
+    #[arg(short = 'p', long = "prefix", conflicts_with = "model_paths")]
+    prefix: Option<String>,
+
+    /// Number of splits (required if using --prefix)
+    #[arg(short = 'n', long = "num-splits", requires = "prefix")]
+    num_splits: Option<u32>,
+
+    /// Prompt to use for generation
+    #[arg(short = 't', long = "prompt", default_value = "Once upon a time")]
+    prompt: String,
+
+    /// Number of tokens to generate
+    #[arg(short = 'g', long = "n-predict", default_value_t = 128)]
+    n_predict: i32,
+
+    /// Number of GPU layers
+    #[arg(short = 'l', long = "n-gpu-layers", default_value_t = 0)]
+    n_gpu_layers: u32,
+
+    /// Context size
+    #[arg(short = 'c', long = "ctx-size", default_value_t = 2048)]
+    ctx_size: u32,
+
+    /// Temperature for sampling
+    #[arg(long = "temp", default_value_t = 0.8)]
+    temperature: f32,
+
+    /// Top-P for sampling
+    #[arg(long = "top-p", default_value_t = 0.95)]
+    top_p: f32,
+
+    /// Seed for random number generation
+    #[arg(long = "seed", default_value_t = 1234)]
+    seed: u32,
+}
+
+fn main() -> Result<()> {
+    let args = Args::parse();
+
+    // Determine the model paths
+    let model_paths = if let Some(prefix) = args.prefix {
+        let num_splits = args.num_splits.expect("num-splits required with prefix");
+
+        // Generate split paths using the utility function
+        let mut paths = Vec::new();
+        for i in 1..=num_splits {
+            let path = LlamaModel::split_path(&prefix, i as i32, num_splits as i32);
+            paths.push(PathBuf::from(path));
+        }
+
+        println!("Generated split paths:");
+        for path in &paths {
+            println!("  - {}", path.display());
+        }
+
+        paths
+    } else {
+        args.model_paths
+    };
+
+    // Verify all split files exist
+    for path in &model_paths {
+        if !path.exists() {
+            eprintln!("Error: Split file not found: {}", path.display());
+            std::process::exit(1);
+        }
+    }
+
+    println!("Loading model from {} splits...", model_paths.len());
+
+    // Initialize the backend
+    let backend = LlamaBackend::init()?;
+
+    // Set up model parameters
+    let mut model_params = LlamaModelParams::default();
+    if args.n_gpu_layers > 0 {
+        model_params = model_params.with_n_gpu_layers(args.n_gpu_layers);
+    }
+
+    // Load the model from splits
+    let model = LlamaModel::load_from_splits(&backend, &model_paths, &model_params)?;
+    println!("Model loaded successfully!");
+
+    // Get model info
+    let n_vocab = model.n_vocab();
+    println!("Model vocabulary size: {}", n_vocab);
+
+    // Create context
+    let ctx_params = LlamaContextParams::default()
+        .with_n_ctx(Some(NonZeroU32::new(args.ctx_size).unwrap()));
+
+    let mut ctx = model.new_context(&backend, ctx_params)?;
+    println!("Context created with size: {}", args.ctx_size);
+
+    // Tokenize the prompt
+    let tokens = model.str_to_token(&args.prompt, AddBos::Always)?;
+    println!("Prompt tokenized into {} tokens", tokens.len());
+
+    // Create batch
+    let mut batch = LlamaBatch::new(512, 1);
+
+    // Add tokens to batch
+    let last_index = tokens.len() - 1;
+    for (i, token) in tokens.iter().enumerate() {
+        let is_last = i == last_index;
+        batch.add(*token, i as i32, &[0], is_last)?;
+    }
+
+    // Decode the batch
+    ctx.decode(&mut batch)?;
+    println!("Initial prompt processed");
+
+    // Set up sampling
+    let mut sampler = LlamaSampler::chain_simple([
+        LlamaSampler::temp(args.temperature),
+        LlamaSampler::top_p(args.top_p, 1),
+    ]);
+
+    // Generate text
+    print!("{}", args.prompt);
+    io::stdout().flush()?;
+
+    let mut n_cur = batch.n_tokens();
+    let mut n_decode = 0;
+
+    while n_decode < args.n_predict {
+        // Sample the next token
+        let new_token = sampler.sample(&ctx, batch.n_tokens() - 1);
+        sampler.accept(new_token);
+
+        // Check for EOS
+        if model.is_eog_token(new_token) {
+            println!();
+            break;
+        }
+
+        // Print the token
+        let piece = model.token_to_str(new_token, llama_cpp_2::model::Special::Tokenize)?;
+        print!("{}", piece);
+        io::stdout().flush()?;
+
+        // Prepare the next batch
+        batch.clear();
+        batch.add(new_token, n_cur, &[0], true)?;
+        n_cur += 1;
+
+        // Decode
+        ctx.decode(&mut batch)?;
+        n_decode += 1;
+    }
+
+    println!("\n\nGeneration complete!");
+    println!("Generated {} tokens", n_decode);
+
+    // Demonstrate the split_prefix utility
+    if let Some(first_path) = model_paths.first() {
+        if let Some(path_str) = first_path.to_str() {
+            // Try to extract the prefix from the first split file
+            if let Some(prefix) = LlamaModel::split_prefix(path_str, 1, model_paths.len() as i32) {
+                println!("\nExtracted prefix from first split: {}", prefix);
+            }
+        }
+    }
+
+    Ok(())
+}
\ No newline at end of file
diff --git a/llama-cpp-2/src/model.rs b/llama-cpp-2/src/model.rs
index d2df9990..3490b466 100644
--- a/llama-cpp-2/src/model.rs
+++ b/llama-cpp-2/src/model.rs
@@ -622,6 +622,174 @@ impl LlamaModel {
         Ok(LlamaModel { model })
     }
 
+    /// Load a model from multiple split files.
+    ///
+    /// This function loads a model that has been split across multiple files. This is useful for
+    /// very large models that exceed filesystem limitations or need to be distributed across
+    /// multiple storage devices.
+    ///
+    /// # Arguments
+    ///
+    /// * `paths` - A slice of paths to the split model files
+    /// * `params` - The model parameters
+    ///
+    /// # Errors
+    ///
+    /// Returns an error if:
+    /// - Any of the paths cannot be converted to a C string
+    /// - The model fails to load from the splits
+    /// - Any path doesn't exist or isn't accessible
+    ///
+    /// # Example
+    ///
+    /// ```no_run
+    /// use llama_cpp_2::model::{LlamaModel, params::LlamaModelParams};
+    /// use llama_cpp_2::llama_backend::LlamaBackend;
+    /// use std::path::Path;
+    ///
+    /// # fn main() -> Result<(), Box<dyn std::error::Error>> {
+    /// let backend = LlamaBackend::init()?;
+    /// let params = LlamaModelParams::default();
+    ///
+    /// let paths = vec![
+    ///     Path::new("model-00001-of-00003.gguf"),
+    ///     Path::new("model-00002-of-00003.gguf"),
+    ///     Path::new("model-00003-of-00003.gguf"),
+    /// ];
+    ///
+    /// let model = LlamaModel::load_from_splits(&backend, &paths, &params)?;
+    /// # Ok(())
+    /// # }
+    /// ```
+    #[tracing::instrument(skip_all)]
+    pub fn load_from_splits(
+        _: &LlamaBackend,
+        paths: &[impl AsRef<Path>],
+        params: &LlamaModelParams,
+    ) -> Result<Self, LlamaModelLoadError> {
+        // Convert paths to C strings
+        let c_strings: Vec<CString> = paths
+            .iter()
+            .map(|p| {
+                let path = p.as_ref();
+                debug_assert!(path.exists(), "{path:?} does not exist");
+                let path_str = path
+                    .to_str()
+                    .ok_or(LlamaModelLoadError::PathToStrError(path.to_path_buf()))?;
+                CString::new(path_str).map_err(LlamaModelLoadError::from)
+            })
+            .collect::<Result<Vec<_>, _>>()?;
+
+        // Create array of pointers to C strings
+        let c_ptrs: Vec<*const c_char> = c_strings.iter().map(|s| s.as_ptr()).collect();
+
+        // Load the model from splits
+        let llama_model = unsafe {
+            llama_cpp_sys_2::llama_model_load_from_splits(
+                c_ptrs.as_ptr() as *mut *const c_char,
+                c_ptrs.len(),
+                params.params,
+            )
+        };
+
+        let model = NonNull::new(llama_model).ok_or(LlamaModelLoadError::NullResult)?;
+
+        tracing::debug!("Loaded model from {} splits", paths.len());
+        Ok(LlamaModel { model })
+    }
+
+    /// Build a split GGUF file path for a specific chunk.
+    ///
+    /// This utility function creates the standardized filename for a split model chunk
+    /// following the pattern: `{prefix}-{split_no:05d}-of-{split_count:05d}.gguf`
+    ///
+    /// # Arguments
+    ///
+    /// * `path_prefix` - The base path and filename prefix
+    /// * `split_no` - The split number (1-indexed)
+    /// * `split_count` - The total number of splits
+    ///
+    /// # Returns
+    ///
+    /// Returns the formatted split path as a String
+    ///
+    /// # Example
+    ///
+    /// ```
+    /// use llama_cpp_2::model::LlamaModel;
+    ///
+    /// let path = LlamaModel::split_path("/models/llama", 2, 4);
+    /// assert_eq!(path, "/models/llama-00002-of-00004.gguf");
+    /// ```
+    pub fn split_path(path_prefix: &str, split_no: i32, split_count: i32) -> String {
+        let mut buffer = vec![0u8; 1024];
+        let path_prefix_cstr = CString::new(path_prefix).unwrap_or_else(|_| CString::new("").unwrap());
+        let len = unsafe {
+            llama_cpp_sys_2::llama_split_path(
+                buffer.as_mut_ptr() as *mut c_char,
+                buffer.len(),
+                path_prefix_cstr.as_ptr(),
+                split_no,
+                split_count,
+            )
+        };
+
+        if len > 0 && len < buffer.len() as i32 {
+            buffer.truncate(len as usize);
+            String::from_utf8(buffer).unwrap_or_else(|_| String::new())
+        } else {
+            String::new()
+        }
+    }
+
+    /// Extract the path prefix from a split filename.
+    ///
+    /// This function extracts the base path prefix from a split model filename,
+    /// but only if the split_no and split_count match the pattern in the filename.
+    ///
+    /// # Arguments
+    ///
+    /// * `split_path` - The full path to a split file
+    /// * `split_no` - The expected split number
+    /// * `split_count` - The expected total number of splits
+    ///
+    /// # Returns
+    ///
+    /// Returns `Some(prefix)` if the split pattern matches, `None` otherwise
+    ///
+    /// # Example
+    ///
+    /// ```
+    /// use llama_cpp_2::model::LlamaModel;
+    ///
+    /// let prefix = LlamaModel::split_prefix("/models/llama-00002-of-00004.gguf", 2, 4);
+    /// assert_eq!(prefix, Some("/models/llama".to_string()));
+    ///
+    /// // Returns None if the pattern doesn't match
+    /// let prefix = LlamaModel::split_prefix("/models/llama-00002-of-00004.gguf", 3, 4);
+    /// assert_eq!(prefix, None);
+    /// ```
+    pub fn split_prefix(split_path: &str, split_no: i32, split_count: i32) -> Option<String> {
+        let mut buffer = vec![0u8; 1024];
+        let split_path_cstr = CString::new(split_path).ok()?;
+        let len = unsafe {
+            llama_cpp_sys_2::llama_split_prefix(
+                buffer.as_mut_ptr() as *mut c_char,
+                buffer.len(),
+                split_path_cstr.as_ptr(),
+                split_no,
+                split_count,
+            )
+        };
+
+        if len > 0 && len < buffer.len() as i32 {
+            buffer.truncate(len as usize);
+            String::from_utf8(buffer).ok()
+        } else {
+            None
+        }
+    }
+
     /// Initializes a lora adapter from a file.
     ///
     /// # Errors
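The two helpers above delegate to llama.cpp's `llama_split_path` / `llama_split_prefix`, which encode the `{prefix}-{split_no:05d}-of-{split_count:05d}.gguf` naming convention. A minimal sketch of that convention in plain Rust follows; no FFI is involved, and the `expected_split_name` helper is illustrative only and not part of this patch (real code should call `LlamaModel::split_path` / `split_prefix`):

```rust
// Illustration only: mirrors the documented split-file naming convention
// without calling into llama.cpp.
fn expected_split_name(prefix: &str, split_no: i32, split_count: i32) -> String {
    format!("{prefix}-{split_no:05}-of-{split_count:05}.gguf")
}

fn main() {
    // File names a 3-way split of "/models/llama" is expected to use.
    let names: Vec<String> = (1..=3)
        .map(|i| expected_split_name("/models/llama", i, 3))
        .collect();
    assert_eq!(
        names,
        [
            "/models/llama-00001-of-00003.gguf",
            "/models/llama-00002-of-00003.gguf",
            "/models/llama-00003-of-00003.gguf",
        ]
    );
}
```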
From b858f12d446cfe59f2da569877c64b9d6d28f12b Mon Sep 17 00:00:00 2001
From: Andy
Date: Sat, 23 Aug 2025 16:58:29 +0800
Subject: [PATCH 2/4] fix: Remove unused import and clean up workspace members
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- Remove unused Path import from split_model example
- Remove RPC example from workspace members on split-model-loading branch

🤖 Generated with [Claude Code](https://claude.ai/code)

Co-Authored-By: Claude
---
 Cargo.lock                       | 9 +++++++++
 Cargo.toml                       | 1 -
 examples/split_model/src/main.rs | 2 +-
 3 files changed, 10 insertions(+), 2 deletions(-)

diff --git a/Cargo.lock b/Cargo.lock
index 4decebc7..0c9794c0 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -1121,6 +1121,15 @@ version = "1.13.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "3c5e1a9a646d36c3599cd173a41282daf47c44583ad367b8e6837255952e5c67"
 
+[[package]]
+name = "split_model"
+version = "0.1.0"
+dependencies = [
+ "anyhow",
+ "clap",
+ "llama-cpp-2",
+]
+
 [[package]]
 name = "stable_deref_trait"
 version = "1.2.0"
diff --git a/Cargo.toml b/Cargo.toml
index 011bc34e..3d5feb51 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -8,7 +8,6 @@ members = [
     "examples/reranker",
     "examples/mtmd",
     "examples/split_model",
-    "examples/rpc",
 ]
 
 [workspace.dependencies]
diff --git a/examples/split_model/src/main.rs b/examples/split_model/src/main.rs
index 92f141b7..324c3a4a 100644
--- a/examples/split_model/src/main.rs
+++ b/examples/split_model/src/main.rs
@@ -16,7 +16,7 @@ use llama_cpp_2::{
 };
 use std::io::{self, Write};
 use std::num::NonZeroU32;
-use std::path::{Path, PathBuf};
+use std::path::PathBuf;
 
 /// Command line arguments for the split model example
 #[derive(Parser, Debug)]
From 8f3cb9b9b0ea0f8cdd0fd9399cfb4b4feac1cf93 Mon Sep 17 00:00:00 2001
From: Andy
Date: Sat, 23 Aug 2025 17:06:29 +0800
Subject: [PATCH 3/4] docs: Add missing RopeType variant documentation
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- Added documentation comments for RopeType enum variants
- Ensures all public APIs are properly documented

🤖 Generated with [Claude Code](https://claude.ai/code)

Co-Authored-By: Claude
---
 llama-cpp-2/src/model.rs | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/llama-cpp-2/src/model.rs b/llama-cpp-2/src/model.rs
index 3490b466..ee85d066 100644
--- a/llama-cpp-2/src/model.rs
+++ b/llama-cpp-2/src/model.rs
@@ -95,9 +95,13 @@ impl LlamaChatMessage {
 
 /// The Rope type that's used within the model.
 #[derive(Debug, Clone, Copy, PartialEq, Eq)]
 pub enum RopeType {
+    /// Normal RoPE (Rotary Position Embedding)
     Norm,
+    /// GPT-NeoX style RoPE
     NeoX,
+    /// Multi-resolution RoPE
     MRope,
+    /// Vision model RoPE
     Vision,
 }
From a0d8be5f02276af9a781e0e7efcbf315d3e19cd4 Mon Sep 17 00:00:00 2001
From: Andy
Date: Sat, 23 Aug 2025 18:10:44 +0800
Subject: [PATCH 4/4] feat(llama-cpp-sys-2): add dynamic tools CMakeLists.txt
 generation for split-model-loading
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This commit adds the scalable dynamic tools building system to the
split-model-loading branch:

- Adds generate_tools_cmake() function to dynamically create tools/CMakeLists.txt
- Only builds tools for enabled features (solving PR #806 issue)
- Split model loading doesn't require tools but maintains architecture consistency
- Includes tools/CMakeLists.txt in Cargo.toml for build system compatibility
- Uses feature-based conditional compilation for future extensibility

This creates a merge-friendly architecture where each feature branch can extend
tool building without conflicts.

🤖 Generated with [Claude Code](https://claude.ai/code)

Co-Authored-By: Claude
---
 llama-cpp-sys-2/Cargo.toml |  1 +
 llama-cpp-sys-2/build.rs   | 61 ++++++++++++++++++++++++++++++++++++--
 2 files changed, 60 insertions(+), 2 deletions(-)

diff --git a/llama-cpp-sys-2/Cargo.toml b/llama-cpp-sys-2/Cargo.toml
index 0a52aba9..96c4bfb8 100644
--- a/llama-cpp-sys-2/Cargo.toml
+++ b/llama-cpp-sys-2/Cargo.toml
@@ -51,6 +51,7 @@ include = [
     "/llama.cpp/ggml/CMakeLists.txt",
     "/llama.cpp/ggml/src/CMakeLists.txt",
     "/llama.cpp/src/CMakeLists.txt",
+    "/llama.cpp/tools/CMakeLists.txt",
 
     "/llama.cpp/cmake",
     "/llama.cpp/ggml/cmake",
diff --git a/llama-cpp-sys-2/build.rs b/llama-cpp-sys-2/build.rs
index 08fcd49b..2110ab33 100644
--- a/llama-cpp-sys-2/build.rs
+++ b/llama-cpp-sys-2/build.rs
@@ -1,6 +1,7 @@
 use cmake::Config;
 use glob::glob;
 use std::env;
+use std::fs;
 use std::path::{Path, PathBuf};
 use std::process::Command;
 use walkdir::DirEntry;
@@ -197,6 +198,52 @@ fn is_hidden(e: &DirEntry) -> bool {
         .unwrap_or_default()
 }
 
+/// Generate a dynamic tools CMakeLists.txt based on enabled features
+/// This approach allows each feature branch to add their own tool without conflicts
+fn generate_tools_cmake() {
+    let mut cmake_content = String::from(
+r#"# Auto-generated tools CMakeLists.txt based on enabled features
+# This file is created dynamically to only build tools for enabled features
+
+# dependencies
+find_package(Threads REQUIRED)
+
+# third-party
+# ...
+
+# flags
+llama_add_compile_flags()
+
+# tools - only build what's needed based on enabled features
+if (NOT EMSCRIPTEN)
+"#);
+
+    // Add tools based on enabled features
+    if cfg!(feature = "mtmd") {
+        cmake_content.push_str("    add_subdirectory(mtmd)\n");
+    }
+
+    // Future feature branches can add their tools here:
+    // if cfg!(feature = "rpc") {
+    //     cmake_content.push_str("    add_subdirectory(rpc)\n");
+    // }
+    // if cfg!(feature = "server") {
+    //     cmake_content.push_str("    add_subdirectory(server)\n");
+    // }
+    // if cfg!(feature = "quantize") {
+    //     cmake_content.push_str("    add_subdirectory(quantize)\n");
+    // }
+
+    // Split model loading doesn't need any tools - it's just a library feature
+
+    cmake_content.push_str("endif()\n");
+
+    // Write the generated CMakeLists.txt
+    let tools_cmake_path = Path::new("llama.cpp/tools/CMakeLists.txt");
+    fs::write(tools_cmake_path, cmake_content)
+        .expect("Failed to write generated tools CMakeLists.txt");
+}
+
 fn main() {
     println!("cargo:rerun-if-changed=build.rs");
 
@@ -448,10 +495,20 @@ fn main() {
     config.define("LLAMA_BUILD_TOOLS", "OFF");
     config.define("LLAMA_CURL", "OFF");
 
-    if cfg!(feature = "mtmd") {
+    // Generate dynamic tools CMakeLists.txt based on enabled tool features
+    let any_tool_features = cfg!(feature = "mtmd")
+        // Future tool features can be added here by other branches:
+        // || cfg!(feature = "rpc")
+        // || cfg!(feature = "server")
+        // || cfg!(feature = "quantize")
+        ;
+
+    if any_tool_features {
         config.define("LLAMA_BUILD_COMMON", "ON");
-        // mtmd support in llama-cpp is within the tools directory
         config.define("LLAMA_BUILD_TOOLS", "ON");
+
+        // Generate the tools CMakeLists.txt with only enabled features
+        generate_tools_cmake();
     }
 
     // Pass CMAKE_ environment variables down to CMake
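For context on what `generate_tools_cmake()` produces, the sketch below assembles the equivalent string for the case where only the `mtmd` tool feature is enabled. It is a simplified, stand-alone model of the build script's logic; the `tools_cmake_for` helper is an assumption for illustration, whereas the real build.rs keys off `cfg!(feature = ...)` at compile time and writes the result to `llama.cpp/tools/CMakeLists.txt`:

```rust
// Simplified stand-in for generate_tools_cmake(): returns the CMake text
// that would be generated when only the `mtmd` tool feature is enabled.
fn tools_cmake_for(mtmd_enabled: bool) -> String {
    let mut cmake = String::from(
        "# Auto-generated tools CMakeLists.txt based on enabled features\n\
         find_package(Threads REQUIRED)\n\
         llama_add_compile_flags()\n\
         if (NOT EMSCRIPTEN)\n",
    );
    if mtmd_enabled {
        cmake.push_str("    add_subdirectory(mtmd)\n");
    }
    cmake.push_str("endif()\n");
    cmake
}

fn main() {
    let generated = tools_cmake_for(true);
    assert!(generated.contains("add_subdirectory(mtmd)"));
    print!("{generated}");
}
```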