scaleapi · dmchoiboi · Oct 17, 2024 · Oct 17, 2024
diff --git a/model-engine/model_engine_server/domain/use_cases/llm_model_endpoint_use_cases.py b/model-engine/model_engine_server/domain/use_cases/llm_model_endpoint_use_cases.py
@@ -143,6 +143,7 @@
 
 LLM_METADATA_KEY = "_llm"
 RESERVED_METADATA_KEYS = [LLM_METADATA_KEY, CONVERTED_FROM_ARTIFACT_LIKE_KEY]
+VLLM_MODEL_WEIGHTS_FOLDER = "model_files"
 
 INFERENCE_FRAMEWORK_REPOSITORY: Dict[LLMInferenceFramework, str] = {
     LLMInferenceFramework.DEEPSPEED: "instant-llm",
@@ -2792,6 +2793,10 @@ async def execute(
 
         validate_endpoint_supports_openai_completion(model_endpoint, endpoint_content)
 
+        # For vLLM endpoints, override request.model with the weights folder name
+        if endpoint_content.inference_framework == LLMInferenceFramework.VLLM:
+            request.model = VLLM_MODEL_WEIGHTS_FOLDER
+
         inference_request = SyncEndpointPredictV1Request(
             args=request.model_dump(exclude_none=True),
             destination_path=OPENAI_COMPLETION_PATH,
@@ -2894,6 +2899,10 @@ async def execute(
 
         validate_endpoint_supports_openai_completion(model_endpoint, model_content)
 
+        # For vLLM endpoints, override request.model with the weights folder name
+        if model_content.inference_framework == LLMInferenceFramework.VLLM:
+            request.model = VLLM_MODEL_WEIGHTS_FOLDER
+
         inference_request = SyncEndpointPredictV1Request(
             args=request.model_dump(exclude_none=True),
             destination_path=OPENAI_COMPLETION_PATH,
@@ -3051,6 +3060,10 @@ async def execute(
 
         validate_endpoint_supports_chat_completion(model_endpoint, endpoint_content)
 
+        # For vLLM endpoints, override request.model with the weights folder name
+        if endpoint_content.inference_framework == LLMInferenceFramework.VLLM:
+            request.model = VLLM_MODEL_WEIGHTS_FOLDER
+
         inference_request = SyncEndpointPredictV1Request(
             args=request.model_dump(exclude_none=True),
             destination_path=OPENAI_CHAT_COMPLETION_PATH,
@@ -3152,6 +3165,10 @@ async def execute(
         )
         validate_endpoint_supports_chat_completion(model_endpoint, model_content)
 
+        # For vLLM endpoints, override request.model with the weights folder name
+        if model_content.inference_framework == LLMInferenceFramework.VLLM:
+            request.model = VLLM_MODEL_WEIGHTS_FOLDER
+
         inference_request = SyncEndpointPredictV1Request(
             args=request.model_dump(exclude_none=True),
             destination_path=OPENAI_CHAT_COMPLETION_PATH,