A Swift package to interact with local Large Language Models (LLMs) on Apple platforms.
Demo / Multimodal

| MobileVLM-3B (llama.cpp) | Qwen2.5 VL 3B (MLX) |
|---|---|
| llamacpp-mobilevlm.mov | mlx-qwen2.5.mov |

iPhone 16 Pro
Important
This project is still experimental. The API is subject to change.
Tip
To run larger models more reliably, consider adding the com.apple.developer.kernel.increased-memory-limit entitlement to your app.
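The entitlement can be enabled through the "Increased Memory Limit" capability in Xcode's Signing & Capabilities tab, which adds an entry like the following to your app's .entitlements file:

<key>com.apple.developer.kernel.increased-memory-limit</key>
<true/>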
- Support for GGUF / MLX models and the FoundationModels framework
- Support for iOS, macOS, and Linux
- Streaming API
- Multimodal (experimental)
- Tool calling (experimental)
Add the following dependency to your Package.swift file:
dependencies: [
.package(url: "https://github.com/tattn/LocalLLMClient.git", branch: "main")
]
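Then add the library products you need to your app target. A minimal sketch, assuming the product names match the module names imported in the examples below (for instance LocalLLMClientLlama for the llama.cpp backend and LocalLLMClientMLX for MLX):

.target(
    name: "MyApp", // your target name
    dependencies: [
        .product(name: "LocalLLMClient", package: "LocalLLMClient"),
        .product(name: "LocalLLMClientLlama", package: "LocalLLMClient"),
        .product(name: "LocalLLMClientMLX", package: "LocalLLMClient")
    ]
)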
The API documentation is available here.
import LocalLLMClient
import LocalLLMClientLlama
let session = LLMSession(model: .llama(
id: "lmstudio-community/gemma-3-4B-it-qat-GGUF",
model: "gemma-3-4B-it-QAT-Q4_0.gguf"
))
print(try await session.respond(to: "Tell me a joke."))
for try await text in session.streamResponse(to: "Write a story about cats.") {
print(text, terminator: "")
}
Using llama.cpp
import LocalLLMClient
import LocalLLMClientLlama
// Create a model
let model = LLMSession.DownloadModel.llama(
id: "lmstudio-community/gemma-3-4B-it-qat-GGUF",
model: "gemma-3-4B-it-QAT-Q4_0.gguf",
parameter: .init(
temperature: 0.7, // Randomness (0.0 to 1.0)
topK: 40, // Top-K sampling
topP: 0.9, // Top-P (nucleus) sampling
options: .init(responseFormat: .json) // Response format
)
)
// You can track download progress
try await model.downloadModel { progress in
print("Download progress: \(progress)")
}
// Create a session with the downloaded model
let session = LLMSession(model: model)
// Generate a response with a specific prompt
let response = try await session.respond(to: """
Create the beginning of a synopsis for an epic story with a cat as the main character.
Format it in JSON, as shown below.
{
"title": "<title>",
"content": "<content>",
}
""")
print(response)
// You can also add system messages before asking questions
session.messages = [.system("You are a helpful assistant.")]
Using Apple MLX
import LocalLLMClient
import LocalLLMClientMLX
// Create a model
let model = LLMSession.DownloadModel.mlx(
id: "mlx-community/Qwen3-1.7B-4bit",
parameter: .init(
temperature: 0.7, // Randomness (0.0 to 1.0)
topP: 0.9 // Top-P (nucleus) sampling
)
)
// You can track download progress
try await model.downloadModel { progress in
print("Download progress: \(progress)")
}
// Create a session with the downloaded model
let session = LLMSession(model: model)
// Generate text with system and user messages
session.messages = [.system("You are a helpful assistant.")]
let response = try await session.respond(to: "Tell me a story about a cat.")
print(response)
Using Apple FoundationModels
import LocalLLMClient
import LocalLLMClientFoundationModels
// Available on iOS 26.0+ / macOS 26.0+ and requires Apple Intelligence
let session = LLMSession(model: .foundationModels(
// Use system's default model
model: .default,
// Configure generation options
parameter: .init(
temperature: 0.7
)
))
// Generate a response with a specific prompt
let response = try await session.respond(to: "Tell me a short story about a clever fox.")
print(response)
LocalLLMClient supports tool calling for integration with external systems.
Important
Tool calling is only available with models that support it, and compatibility differs between backends.
Make sure your chosen model explicitly supports tool calling before using this feature.
Using tool calling
import LocalLLMClient
import LocalLLMClientLlama
@Tool("get_weather")
struct GetWeatherTool {
let description = "Get the current weather in a given location"
@ToolArguments
struct Arguments {
@ToolArgument("The city and state, e.g. San Francisco, CA")
var location: String
@ToolArgument("Temperature unit")
var unit: Unit?
@ToolArgumentEnum
enum Unit: String {
case celsius
case fahrenheit
}
}
func call(arguments: Arguments) async throws -> ToolOutput {
// In a real implementation, this would call a weather API
let temp = arguments.unit == .celsius ? "22°C" : "72°F"
return ToolOutput([
"location": arguments.location,
"temperature": temp,
"condition": "sunny"
])
}
}
// Create the tool
let weatherTool = GetWeatherTool()
// Create a session with a model that supports tool calling and register tools
let session = LLMSession(
model: .llama(
id: "Qwen/Qwen2.5-1.5B-Instruct-GGUF",
model: "qwen2.5-1.5b-instruct-q4_k_m.gguf"
),
tools: [weatherTool]
)
// Ask a question that requires tool use
let response = try await session.respond(to: "What's the weather like in Tokyo?")
print(response)
// The model will automatically call the weather tool and include the result in its response
LocalLLMClient also supports multimodal models for processing images.
Using with llama.cpp
import LocalLLMClient
import LocalLLMClientLlama
// Create a session with a multimodal model
let session = LLMSession(model: .llama(
id: "ggml-org/gemma-3-4b-it-GGUF",
model: "gemma-3-4b-it-Q8_0.gguf",
mmproj: "mmproj-model-f16.gguf"
))
// Ask a question about an image
let response = try await session.respond(
to: "What's in this image?",
attachments: [.image(.init(resource: .yourImage))]
)
print(response)
// You can also stream the response
for try await text in session.streamResponse(
to: "Describe this image in detail",
attachments: [.image(.init(resource: .yourImage))]
) {
print(text, terminator: "")
}
Using with Apple MLX
import LocalLLMClient
import LocalLLMClientMLX
// Create a session with a multimodal model
let session = LLMSession(model: .mlx(
id: "mlx-community/Qwen2.5-VL-3B-Instruct-abliterated-4bit"
))
// Ask a question about an image
let response = try await session.respond(
to: "What's in this image?",
attachments: [.image(.init(resource: .yourImage))]
)
print(response)
For more advanced control over model loading and inference, you can use the LocalLLMClient APIs directly.
Using with llama.cpp
import LocalLLMClient
import LocalLLMClientLlama
import LocalLLMClientUtility
// Download model from Hugging Face (Gemma 3)
let ggufName = "gemma-3-4B-it-QAT-Q4_0.gguf"
let downloader = FileDownloader(source: .huggingFace(
id: "lmstudio-community/gemma-3-4B-it-qat-GGUF",
globs: [ggufName]
))
try await downloader.download { print("Progress: \($0)") }
// Initialize a client with the downloaded model
let modelURL = downloader.destination.appending(component: ggufName)
let client = try await LocalLLMClient.llama(url: modelURL, parameter: .init(
context: 4096, // Context size
temperature: 0.7, // Randomness (0.0 to 1.0)
topK: 40, // Top-K sampling
topP: 0.9, // Top-P (nucleus) sampling
options: .init(responseFormat: .json) // Response format
))
let prompt = """
Create the beginning of a synopsis for an epic story with a cat as the main character.
Format it in JSON, as shown below.
{
"title": "<title>",
"content": "<content>",
}
"""
// Generate text
let input = LLMInput.chat([
.system("You are a helpful assistant."),
.user(prompt)
])
for try await text in try await client.textStream(from: input) {
print(text, terminator: "")
}
Using with Apple MLX
import LocalLLMClient
import LocalLLMClientMLX
import LocalLLMClientUtility
// Download model from Hugging Face
let downloader = FileDownloader(
source: .huggingFace(id: "mlx-community/Qwen3-1.7B-4bit", globs: .mlx)
)
try await downloader.download { print("Progress: \($0)") }
// Initialize a client with the downloaded model
let client = try await LocalLLMClient.mlx(url: downloader.destination, parameter: .init(
temperature: 0.7, // Randomness (0.0 to 1.0)
topP: 0.9 // Top-P (nucleus) sampling
))
// Generate text
let input = LLMInput.chat([
.system("You are a helpful assistant."),
.user("Tell me a story about a cat.")
])
for try await text in try await client.textStream(from: input) {
print(text, terminator: "")
}
Using with Apple FoundationModels
import LocalLLMClient
import LocalLLMClientFoundationModels
// Available on iOS 26.0+ / macOS 26.0+ and requires Apple Intelligence
let client = try await LocalLLMClient.foundationModels(
// Use system's default model
model: .default,
// Configure generation options
parameter: .init(
temperature: 0.7
)
)
// Generate text
let input = LLMInput.chat([
.system("You are a helpful assistant."),
.user("Tell me a short story about a clever fox.")
])
for try await text in try await client.textStream(from: input) {
print(text, terminator: "")
}
Advanced Multimodal with llama.cpp
import LocalLLMClient
import LocalLLMClientLlama
import LocalLLMClientUtility
// Download model from Hugging Face (Gemma 3)
let model = "gemma-3-4b-it-Q8_0.gguf"
let mmproj = "mmproj-model-f16.gguf"
let downloader = FileDownloader(
source: .huggingFace(id: "ggml-org/gemma-3-4b-it-GGUF", globs: [model, mmproj])
)
try await downloader.download { print("Download: \($0)") }
// Initialize a client with the downloaded model
let client = try await LocalLLMClient.llama(
url: downloader.destination.appending(component: model),
mmprojURL: downloader.destination.appending(component: mmproj)
)
let input = LLMInput.chat([
.user("What's in this image?", attachments: [.image(.init(resource: .yourImage))]),
])
// Generate text without streaming
print(try await client.generateText(from: input))
Advanced Multimodal with Apple MLX
import LocalLLMClient
import LocalLLMClientMLX
import LocalLLMClientUtility
// Download model from Hugging Face (Qwen2.5 VL)
let downloader = FileDownloader(source: .huggingFace(
id: "mlx-community/Qwen2.5-VL-3B-Instruct-abliterated-4bit",
globs: .mlx
))
try await downloader.download { print("Progress: \($0)") }
let client = try await LocalLLMClient.mlx(url: downloader.destination)
let input = LLMInput.chat([
.user("What's in this image?", attachments: [.image(.init(resource: .yourImage))]),
])
// Generate text without streaming
print(try await client.generateText(from: input))
You can also use LocalLLMClient from the terminal via the included command-line tool:
# Run using llama.cpp
swift run LocalLLMCLI --model /path/to/your/model.gguf "Your prompt here"
# Run using MLX
./scripts/run_mlx.sh --model https://huggingface.co/mlx-community/Qwen3-1.7B-4bit "Your prompt here"
- LLaMA 3
- Gemma 3 / 2
- Qwen 3 / 2
- Phi 4
- Models compatible with the llama.cpp backend
- Models compatible with the MLX backend
If you have a model that works, please open an issue or PR to add it to the list.
- iOS 16.0+ / macOS 14.0+
- Xcode 16.0+
This package uses llama.cpp, Apple's MLX, and the Foundation Models framework for model inference.