9 changes: 1 addition & 8 deletions tensorrt_llm/llmapi/llm_args.py
@@ -1550,7 +1550,7 @@ class BaseLlmArgs(StrictBaseModel):
description="Return perf metrics.",
status="prototype")

-orchestrator_type: Optional[Literal["rpc"]] = Field(
+orchestrator_type: Optional[Literal["rpc", "ray"]] = Field(
default=None,
description=
"The orchestrator type to use. Defaults to None, which uses MPI.",
@@ -2444,13 +2444,6 @@ class TorchLlmArgs(BaseLlmArgs):
status="prototype",
)

-orchestrator_type: Optional[Literal["ray"]] = Field(
-default=None,
-description=
-"The orchestrator type to use. Options: 'ray'. Defaults to None, which uses MPI.",
-status="prototype",
-)
-
# PrivateVars
_quant_config: Optional[QuantConfig] = PrivateAttr(default=None)

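With the duplicate field removed from TorchLlmArgs, orchestrator_type is defined once on BaseLlmArgs and now accepts either "rpc" or "ray". A minimal usage sketch, assuming a local model path and that the chosen backend (Ray here) is available in the environment; leaving orchestrator_type unset keeps the default MPI path:

from tensorrt_llm import LLM
from tensorrt_llm.sampling_params import SamplingParams

# Select the Ray orchestrator; "rpc" is the other accepted value.
# orchestrator_type=None (the default) falls back to MPI.
llm = LLM(model="/path/to/llama-model", orchestrator_type="ray")

res = llm.generate("The capital of France is",
                   SamplingParams(max_tokens=10))
print(res.outputs[0].text)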
6 changes: 1 addition & 5 deletions tests/unittest/api_stability/references/llm.yaml
@@ -75,10 +75,6 @@ methods:
annotation: Optional[str]
default: null
status: deprecated
-orchestrator_type:
-annotation: Optional[Literal['ray']]
-default: null
-status: prototype
build_config:
annotation: Optional[tensorrt_llm.llmapi.llm_args.BuildConfig]
default: null
@@ -184,7 +180,7 @@ methods:
default: False
status: prototype
orchestrator_type:
-annotation: Optional[Literal["rpc"]]
+annotation: Optional[Literal["rpc", "ray"]]
default: null
status: prototype
return_annotation: None
5 changes: 5 additions & 0 deletions tests/unittest/llmapi/test_llm_multi_gpu_pytorch.py
@@ -8,6 +8,7 @@
from .lora_test_utils import check_llama_7b_multi_lora_from_request_test_harness
from .test_llm_pytorch import llama_7b_lora_from_dir_test_harness
from .test_llm import _test_llm_capture_request_error
+from utils.util import skip_ray
# isort: on
from tensorrt_llm.executor.rpc_proxy import GenerationExecutorRpcProxy
from tensorrt_llm.sampling_params import SamplingParams
@@ -61,6 +62,8 @@ def test_llama_7b_multi_lora_tp2():
cuda_graph_config=None)


+@pytest.mark.skip(reason="https://nvbugs/5560921")
+@skip_ray
@pytest.mark.gpu2
def test_llm_rpc_tp2():
with LLM(model=llama_model_path,
@@ -78,6 +81,8 @@ def test_llm_rpc_tp2():
assert len(res.outputs[0].token_ids) == 10


+@pytest.mark.skip(reason="https://nvbugs/5560921")
+@skip_ray
@pytest.mark.gpu2
@pytest.mark.asyncio
async def test_llm_rpc_streaming_tp2():
4 changes: 4 additions & 0 deletions tests/unittest/llmapi/test_llm_pytorch.py
@@ -956,6 +956,8 @@ def test_max_num_token_check(self):
llm.generate([ids])


+@pytest.mark.skip(reason="https://nvbugs/5560921")
+@skip_ray
def test_llm_rpc():
# TODO: remove the with-statement when shutdown hang issue is fixed
with LLM(model=llama_model_path,
@@ -972,6 +974,8 @@ def test_llm_rpc():
assert len(res.outputs[0].token_ids) == 10


+@pytest.mark.skip(reason="https://nvbugs/5560921")
+@skip_ray
@pytest.mark.asyncio
async def test_llm_rpc_streaming():
# TODO: remove the with-statement when shutdown hang issue is fixed
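The RPC tests are also guarded by the skip_ray marker imported from utils.util, so they are skipped when the suite runs against the Ray orchestrator. The marker's implementation is not part of this diff; the snippet below is only a hypothetical equivalent, assuming an environment variable (here RUN_RAY) selects the Ray backend:

import os

import pytest

# Hypothetical stand-in for utils.util.skip_ray: skip the test whenever the
# suite is configured to run with the Ray orchestrator. The RUN_RAY variable
# name is an assumption, not taken from the repository.
skip_ray = pytest.mark.skipif(
    os.environ.get("RUN_RAY", "0") == "1",
    reason="test not supported with the Ray orchestrator")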