7 changes: 6 additions & 1 deletion backend/backend.proto
@@ -165,7 +165,7 @@ message Reply {

message GrammarTrigger {
  string word = 1;
-  bool at_start = 2;
+  bool at_start = 2;
}

message ModelOptions {
@@ -229,6 +229,11 @@ message ModelOptions {
  int32 MaxModelLen = 54;
  int32 TensorParallelSize = 55;
  string LoadFormat = 58;
+  bool DisableLogStatus = 66;
+  string DType = 67;
+  int32 LimitImagePerPrompt = 68;
+  int32 LimitVideoPerPrompt = 69;
+  int32 LimitAudioPerPrompt = 70;

  string MMProj = 41;

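The five new fields take the next free tag numbers (66-70), so existing entries keep their wire format. As a hedged sketch of how they travel over gRPC, assuming the stubs generated from this proto are importable as backend_pb2 (as in LocalAI's other Python backends), and with an illustrative model name:

# Hedged sketch: backend_pb2 is assumed to be the stub generated from
# backend.proto; the model name and limit values are illustrative only.
import backend_pb2

opts = backend_pb2.ModelOptions(
    Model="some-multimodal-model",   # hypothetical model identifier
    DType="float16",
    DisableLogStatus=True,
    LimitImagePerPrompt=2,           # allow up to two images per prompt
    LimitVideoPerPrompt=1,
    LimitAudioPerPrompt=1,
)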
17 changes: 14 additions & 3 deletions backend/python/vllm/backend.py
@@ -109,6 +109,17 @@ async def LoadModel(self, request, context):
            engine_args.swap_space = request.SwapSpace
        if request.MaxModelLen != 0:
            engine_args.max_model_len = request.MaxModelLen
+        if request.DisableLogStatus:
+            engine_args.disable_log_stats = request.DisableLogStatus
+        if request.DType != "":
+            engine_args.dtype = request.DType
+        if request.LimitImagePerPrompt != 0 or request.LimitVideoPerPrompt != 0 or request.LimitAudioPerPrompt != 0:
+            # limit-mm-per-prompt defaults to 1 per modality, based on vLLM docs
+            engine_args.limit_mm_per_prompt = {
+                "image": max(request.LimitImagePerPrompt, 1),
+                "video": max(request.LimitVideoPerPrompt, 1),
+                "audio": max(request.LimitAudioPerPrompt, 1)
+            }

        try:
            self.llm = AsyncLLMEngine.from_engine_args(engine_args)
@@ -269,7 +280,7 @@ async def _predict(self, request, context, streaming=False):
    def load_image(self, image_path: str):
        """
        Load an image from the given file path or base64 encoded data.

        Args:
            image_path (str): The path to the image file or base64 encoded data.

@@ -288,7 +299,7 @@ def load_image(self, image_path: str):
    def load_video(self, video_path: str):
        """
        Load a video from the given file path.

        Args:
            video_path (str): The path to the video file.

@@ -335,4 +346,4 @@ async def serve(address):
    )
    args = parser.parse_args()

-    asyncio.run(serve(args.addr))
\ No newline at end of file
+    asyncio.run(serve(args.addr))
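The limit_mm_per_prompt branch above only fires when at least one limit is configured, and it then clamps every modality to at least 1, matching vLLM's documented per-modality default. A standalone sketch of that rule (mm_limits is a hypothetical helper, not part of the backend):

# Standalone sketch of the defaulting rule above; mm_limits is a
# hypothetical helper used only to illustrate the behavior.
def mm_limits(image: int = 0, video: int = 0, audio: int = 0) -> dict:
    # Leave engine defaults untouched when no limit is configured.
    if image == 0 and video == 0 and audio == 0:
        return {}
    # Any modality left unset falls back to vLLM's default of 1.
    return {"image": max(image, 1), "video": max(video, 1), "audio": max(audio, 1)}

assert mm_limits() == {}
assert mm_limits(image=2) == {"image": 2, "video": 1, "audio": 1}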
6 changes: 6 additions & 0 deletions core/backend/options.go
@@ -159,6 +159,12 @@ func grpcModelOpts(c config.BackendConfig) *pb.ModelOptions {
        SwapSpace:          int32(c.SwapSpace),
        MaxModelLen:        int32(c.MaxModelLen),
        TensorParallelSize: int32(c.TensorParallelSize),
+       DisableLogStatus:   c.DisableLogStatus,
+       DType:              c.DType,
+       // LimitMMPerPrompt vLLM
+       LimitImagePerPrompt: int32(c.LimitMMPerPrompt.LimitImagePerPrompt),
+       LimitVideoPerPrompt: int32(c.LimitMMPerPrompt.LimitVideoPerPrompt),
+       LimitAudioPerPrompt: int32(c.LimitMMPerPrompt.LimitAudioPerPrompt),
        MMProj:             c.MMProj,
        FlashAttention:     c.FlashAttention,
        CacheTypeKey:       c.CacheTypeK,
48 changes: 29 additions & 19 deletions core/config/backend_config.go
@@ -130,25 +130,28 @@ type LLMConfig struct {
    TrimSpace  []string `yaml:"trimspace"`
    TrimSuffix []string `yaml:"trimsuffix"`

-    ContextSize *int `yaml:"context_size"`
-    NUMA bool `yaml:"numa"`
-    LoraAdapter string `yaml:"lora_adapter"`
-    LoraBase string `yaml:"lora_base"`
-    LoraAdapters []string `yaml:"lora_adapters"`
-    LoraScales []float32 `yaml:"lora_scales"`
-    LoraScale float32 `yaml:"lora_scale"`
-    NoMulMatQ bool `yaml:"no_mulmatq"`
-    DraftModel string `yaml:"draft_model"`
-    NDraft int32 `yaml:"n_draft"`
-    Quantization string `yaml:"quantization"`
-    LoadFormat string `yaml:"load_format"`
-    GPUMemoryUtilization float32 `yaml:"gpu_memory_utilization"` // vLLM
-    TrustRemoteCode bool `yaml:"trust_remote_code"` // vLLM
-    EnforceEager bool `yaml:"enforce_eager"` // vLLM
-    SwapSpace int `yaml:"swap_space"` // vLLM
-    MaxModelLen int `yaml:"max_model_len"` // vLLM
-    TensorParallelSize int `yaml:"tensor_parallel_size"` // vLLM
-    MMProj string `yaml:"mmproj"`
+    ContextSize *int `yaml:"context_size"`
+    NUMA bool `yaml:"numa"`
+    LoraAdapter string `yaml:"lora_adapter"`
+    LoraBase string `yaml:"lora_base"`
+    LoraAdapters []string `yaml:"lora_adapters"`
+    LoraScales []float32 `yaml:"lora_scales"`
+    LoraScale float32 `yaml:"lora_scale"`
+    NoMulMatQ bool `yaml:"no_mulmatq"`
+    DraftModel string `yaml:"draft_model"`
+    NDraft int32 `yaml:"n_draft"`
+    Quantization string `yaml:"quantization"`
+    LoadFormat string `yaml:"load_format"`
+    GPUMemoryUtilization float32 `yaml:"gpu_memory_utilization"` // vLLM
+    TrustRemoteCode bool `yaml:"trust_remote_code"` // vLLM
+    EnforceEager bool `yaml:"enforce_eager"` // vLLM
+    SwapSpace int `yaml:"swap_space"` // vLLM
+    MaxModelLen int `yaml:"max_model_len"` // vLLM
+    TensorParallelSize int `yaml:"tensor_parallel_size"` // vLLM
+    DisableLogStatus bool `yaml:"disable_log_stats"` // vLLM
+    DType string `yaml:"dtype"` // vLLM
+    LimitMMPerPrompt LimitMMPerPrompt `yaml:"limit_mm_per_prompt"` // vLLM
+    MMProj string `yaml:"mmproj"`

    FlashAttention bool `yaml:"flash_attention"`
    NoKVOffloading bool `yaml:"no_kv_offloading"`
@@ -166,6 +169,13 @@ type LLMConfig struct {
    CFGScale float32 `yaml:"cfg_scale"` // Classifier-Free Guidance Scale
}

+// LimitMMPerPrompt is a struct that holds the configuration for the limit-mm-per-prompt config in vLLM
+type LimitMMPerPrompt struct {
+    LimitImagePerPrompt int `yaml:"image"`
+    LimitVideoPerPrompt int `yaml:"video"`
+    LimitAudioPerPrompt int `yaml:"audio"`
+}
+
// AutoGPTQ is a struct that holds the configuration specific to the AutoGPTQ backend
type AutoGPTQ struct {
    ModelBaseName string `yaml:"model_base_name"`
9 changes: 9 additions & 0 deletions gallery/vllm.yaml
@@ -16,6 +16,8 @@ config_file: |
  use_tokenizer_template: true
  # Uncomment to specify a quantization method (optional)
  # quantization: "awq"
+  # Uncomment to set dtype, choices are: "auto", "half", "float16", "bfloat16", "float", "float32". awq on vLLM does not support bfloat16
+  # dtype: "float16"
  # Uncomment to limit the GPU memory utilization (vLLM default is 0.9 for 90%)
  # gpu_memory_utilization: 0.5
  # Uncomment to trust remote code from huggingface
@@ -30,3 +32,10 @@ config_file: |
  # Allows you to partition and run large models. Performance gains are limited.
  # https://github.com/vllm-project/vllm/issues/1435
  # tensor_parallel_size: 2
+  # Uncomment to disable log stats
+  # disable_log_stats: true
+  # Uncomment to specify multi-modal limits per prompt, defaults to 1 per modality if not specified
+  # limit_mm_per_prompt:
+  #   image: 2
+  #   video: 2
+  #   audio: 2
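With limit_mm_per_prompt.image set to 2 as above, a single prompt may carry two images. A hedged end-to-end check against LocalAI's OpenAI-compatible API; the base URL and model name below are assumptions for illustration:

# Assumes a LocalAI instance on localhost:8080 serving a vLLM multimodal
# model registered as "vllm-model"; both names are placeholders.
from openai import OpenAI

client = OpenAI(base_url="http://localhost:8080/v1", api_key="not-needed")
resp = client.chat.completions.create(
    model="vllm-model",
    messages=[{
        "role": "user",
        "content": [
            {"type": "text", "text": "Compare these two images."},
            {"type": "image_url", "image_url": {"url": "https://example.com/a.png"}},
            {"type": "image_url", "image_url": {"url": "https://example.com/b.png"}},
        ],
    }],
)
print(resp.choices[0].message.content)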