diff --git a/tensorrt_llm/llmapi/llm_args.py b/tensorrt_llm/llmapi/llm_args.py
index caad0d646b1..18f00e0a62a 100644
--- a/tensorrt_llm/llmapi/llm_args.py
+++ b/tensorrt_llm/llmapi/llm_args.py
@@ -1550,7 +1550,7 @@ class BaseLlmArgs(StrictBaseModel):
         description="Return perf metrics.",
         status="prototype")
 
-    orchestrator_type: Optional[Literal["rpc"]] = Field(
+    orchestrator_type: Optional[Literal["rpc", "ray"]] = Field(
         default=None,
         description=
         "The orchestrator type to use. Defaults to None, which uses MPI.",
@@ -2444,13 +2444,6 @@ class TorchLlmArgs(BaseLlmArgs):
         status="prototype",
     )
 
-    orchestrator_type: Optional[Literal["ray"]] = Field(
-        default=None,
-        description=
-        "The orchestrator type to use. Options: 'ray'. Defaults to None, which uses MPI.",
-        status="prototype",
-    )
-
     # PrivateVars
     _quant_config: Optional[QuantConfig] = PrivateAttr(default=None)
 
diff --git a/tests/unittest/api_stability/references/llm.yaml b/tests/unittest/api_stability/references/llm.yaml
index d2c8d588894..0c4a583ddcf 100644
--- a/tests/unittest/api_stability/references/llm.yaml
+++ b/tests/unittest/api_stability/references/llm.yaml
@@ -75,10 +75,6 @@ methods:
         annotation: Optional[str]
         default: null
         status: deprecated
-      orchestrator_type:
-        annotation: Optional[Literal['ray']]
-        default: null
-        status: prototype
       build_config:
         annotation: Optional[tensorrt_llm.llmapi.llm_args.BuildConfig]
         default: null
         status: deprecated
@@ -184,7 +180,7 @@ methods:
         default: False
         status: prototype
       orchestrator_type:
-        annotation: Optional[Literal["rpc"]]
+        annotation: Optional[Literal["rpc", "ray"]]
         default: null
         status: prototype
     return_annotation: None
diff --git a/tests/unittest/llmapi/test_llm_multi_gpu_pytorch.py b/tests/unittest/llmapi/test_llm_multi_gpu_pytorch.py
index b5d34dcd735..b145122d176 100644
--- a/tests/unittest/llmapi/test_llm_multi_gpu_pytorch.py
+++ b/tests/unittest/llmapi/test_llm_multi_gpu_pytorch.py
@@ -8,6 +8,7 @@
 from .lora_test_utils import check_llama_7b_multi_lora_from_request_test_harness
 from .test_llm_pytorch import llama_7b_lora_from_dir_test_harness
 from .test_llm import _test_llm_capture_request_error
+from utils.util import skip_ray
 # isort: on
 from tensorrt_llm.executor.rpc_proxy import GenerationExecutorRpcProxy
 from tensorrt_llm.sampling_params import SamplingParams
@@ -61,6 +62,8 @@ def test_llama_7b_multi_lora_tp2():
         cuda_graph_config=None)
 
 
+@pytest.mark.skip(reason="https://nvbugs/5560921")
+@skip_ray
 @pytest.mark.gpu2
 def test_llm_rpc_tp2():
     with LLM(model=llama_model_path,
@@ -78,6 +81,8 @@ def test_llm_rpc_tp2():
     assert len(res.outputs[0].token_ids) == 10
 
 
+@pytest.mark.skip(reason="https://nvbugs/5560921")
+@skip_ray
 @pytest.mark.gpu2
 @pytest.mark.asyncio
 async def test_llm_rpc_streaming_tp2():
diff --git a/tests/unittest/llmapi/test_llm_pytorch.py b/tests/unittest/llmapi/test_llm_pytorch.py
index ae3df7e40f5..6c055641845 100644
--- a/tests/unittest/llmapi/test_llm_pytorch.py
+++ b/tests/unittest/llmapi/test_llm_pytorch.py
@@ -956,6 +956,8 @@ def test_max_num_token_check(self):
            llm.generate([ids])
 
 
+@pytest.mark.skip(reason="https://nvbugs/5560921")
+@skip_ray
 def test_llm_rpc():
     # TODO: remove the with-statement when shutdown hang issue is fixed
     with LLM(model=llama_model_path,
@@ -972,6 +974,8 @@ def test_llm_rpc():
     assert len(res.outputs[0].token_ids) == 10
 
 
+@pytest.mark.skip(reason="https://nvbugs/5560921")
+@skip_ray
 @pytest.mark.asyncio
 async def test_llm_rpc_streaming():
     # TODO: remove the with-statement when shutdown hang issue is fixed
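Usage note (not part of the patch): with `orchestrator_type` consolidated on `BaseLlmArgs`, a single option now selects the orchestrator for the LLM API: `None` (default) keeps MPI, `"rpc"` uses the RPC proxy, and `"ray"` uses Ray. A minimal sketch of the resulting API, assuming a placeholder local model path and that Ray is installed; the prompt and path are illustrative only, and the feature remains in prototype status:

```python
from tensorrt_llm import LLM, SamplingParams

# Placeholder checkpoint path; substitute any model supported by the LLM API.
llama_model_path = "/path/to/llama-model"

# orchestrator_type is now validated against Literal["rpc", "ray"] on BaseLlmArgs;
# leaving it as None falls back to the MPI orchestrator.
with LLM(model=llama_model_path, orchestrator_type="ray") as llm:
    res = llm.generate("Hello, my name is",
                       sampling_params=SamplingParams(max_tokens=10))
    print(res.outputs[0].text)
```

The RPC tests skipped above (pending https://nvbugs/5560921) exercise the `"rpc"` value through this same constructor argument.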