diff --git a/jenkins/L0_Test.groovy b/jenkins/L0_Test.groovy
index d237cb3dc52..c707ce70a79 100644
--- a/jenkins/L0_Test.groovy
+++ b/jenkins/L0_Test.groovy
@@ -1701,6 +1701,7 @@ def launchTestJobs(pipeline, testFilter, dockerNode=null)
         "DGX_H100-4_GPUs-PyTorch-DeepSeek-1": ["dgx-h100-x4", "l0_dgx_h100", 1, 2, 4],
         "DGX_H100-4_GPUs-PyTorch-DeepSeek-2": ["dgx-h100-x4", "l0_dgx_h100", 2, 2, 4],
         "DGX_H100-4_GPUs-PyTorch-Others-1": ["dgx-h100-x4", "l0_dgx_h100", 1, 1, 4],
+        "DGX_H100-4_GPUs-Triton-[Post-Merge]-1": ["dgx-h100-x4", "l0_dgx_h100", 1, 1, 4],
         "DGX_H100-4_GPUs-CPP-1": ["dgx-h100-x4", "l0_dgx_h100", 1, 1, 4],
         "A10-PyTorch-1": ["a10", "l0_a10", 1, 1],
         "A10-CPP-1": ["a10", "l0_a10", 1, 1],
diff --git a/tests/integration/defs/triton_server/test_triton_llm.py b/tests/integration/defs/triton_server/test_triton_llm.py
index 91cda365dda..135209d79e2 100644
--- a/tests/integration/defs/triton_server/test_triton_llm.py
+++ b/tests/integration/defs/triton_server/test_triton_llm.py
@@ -3627,8 +3627,7 @@ def test_benchmark_core_model(
                          ids=["disableDecoupleMode", "enableDecoupleMode"])
 # TODO: [JIRA-4496] Add batch support in llmapi backend and add tests here.
 @pytest.mark.parametrize("TRITON_MAX_BATCH_SIZE", ["0"])
-# TODO: [JIRA-4040] Add more tensor parallel size
-@pytest.mark.parametrize("TENSOR_PARALLEL_SIZE", ["1"])
+@pytest.mark.parametrize("TENSOR_PARALLEL_SIZE", ["1", "4"])
 def test_llmapi_backend(E2E_MODEL_NAME, DECOUPLED_MODE, TRITON_MAX_BATCH_SIZE,
                         TENSOR_PARALLEL_SIZE,
                         llm_backend_inflight_batcher_llm_root, llm_backend_venv,
@@ -3644,6 +3643,8 @@ def test_llmapi_backend(E2E_MODEL_NAME, DECOUPLED_MODE, TRITON_MAX_BATCH_SIZE,
         model_config = yaml.safe_load(f)
     model_config["triton_config"]["decoupled"] = DECOUPLED_MODE
     model_config["triton_config"]["max_batch_size"] = int(TRITON_MAX_BATCH_SIZE)
+    model_config["tensor_parallel_size"] = int(TENSOR_PARALLEL_SIZE)
+    model_config["kv_cache_config"] = {"free_gpu_memory_fraction": 0.8}
     with open(model_config_path, "w") as f:
         yaml.dump(model_config, f)
 
@@ -3653,12 +3654,14 @@ def test_llmapi_backend(E2E_MODEL_NAME, DECOUPLED_MODE, TRITON_MAX_BATCH_SIZE,
 
     # Launch Triton Server
     launch_server_py = os.path.join(llm_backend_repo_root, "scripts",
                                     "launch_triton_server.py")
-    print_info(
-        f"DEBUG:: launch_server with args: python3 {launch_server_py} --world_size={TENSOR_PARALLEL_SIZE} --model_repo={new_model_repo} --no-mpi"
-    )
-    check_call(
-        f"python3 {launch_server_py} --world_size={TENSOR_PARALLEL_SIZE} --model_repo={new_model_repo} --no-mpi",
-        shell=True)
+    cmd = f"python3 {launch_server_py} --world_size={TENSOR_PARALLEL_SIZE} --model_repo={new_model_repo}"
+    if TENSOR_PARALLEL_SIZE == "4":
+        cmd += " --trtllm_llmapi_launch"
+        cmd += " --oversubscribe"
+    else:
+        cmd += " --no-mpi"
+    print_info(f"DEBUG:: launch_server with args: {cmd}")
+    check_call(cmd, shell=True)
     check_server_ready()
     # Speed up the test by running multiple tests with different configurations sharing the same triton server.
diff --git a/tests/integration/test_lists/test-db/l0_dgx_h100.yml b/tests/integration/test_lists/test-db/l0_dgx_h100.yml
index 81189b73f2d..e6f79690194 100644
--- a/tests/integration/test_lists/test-db/l0_dgx_h100.yml
+++ b/tests/integration/test_lists/test-db/l0_dgx_h100.yml
@@ -136,3 +136,17 @@ l0_dgx_h100:
   - cpp/test_multi_gpu.py::TestDisagg::test_spawn_orchestrator[llama-ucx_kvcache-90]
   - cpp/test_multi_gpu.py::TestDisagg::test_orchestrator_params[llama-nixl_kvcache-90] TIMEOUT (90)
   - cpp/test_multi_gpu.py::TestDisagg::test_spawn_orchestrator[llama-nixl_kvcache-90]
+- condition:
+    ranges:
+      system_gpu_count:
+        gte: 4
+        lte: 4
+    wildcards:
+      gpu:
+      - '*h100*'
+      linux_distribution_name: ubuntu*
+    terms:
+      stage: post_merge
+      backend: triton
+  tests:
+  - triton_server/test_triton_llm.py::test_llmapi_backend[4-0-disableDecoupleMode-tensorrt_llm]
diff --git a/triton_backend/scripts/launch_triton_server.py b/triton_backend/scripts/launch_triton_server.py
index b31986b9610..2af7bd426c5 100644
--- a/triton_backend/scripts/launch_triton_server.py
+++ b/triton_backend/scripts/launch_triton_server.py
@@ -97,6 +97,13 @@ def parse_arguments():
         'Append --oversubscribe to the mpirun command. Mainly for SLURM MPI usecases.'
     )
 
+    parser.add_argument(
+        '--trtllm_llmapi_launch',
+        action='store_true',
+        help='Launch tritonserver with trtllm-llmapi-launch',
+        default=False,
+    )
+
     return parser.parse_args()
 
 
@@ -147,7 +154,7 @@ def add_port_config(cmd, grpc_port, http_port, metrics_port):
 
 
 def get_cmd(world_size, tritonserver, grpc_port, http_port, metrics_port,
             model_repo, log, log_file, tensorrt_llm_model_name, oversubscribe,
-            multimodal_gpu0_cuda_mem_pool_bytes, no_mpi):
+            multimodal_gpu0_cuda_mem_pool_bytes, no_mpi, trtllm_llmapi_launch):
     if no_mpi:
         assert world_size == 1, "world size must be 1 when using no-mpi"
@@ -162,6 +169,8 @@ def get_cmd(world_size, tritonserver, grpc_port, http_port, metrics_port,
     for i in range(world_size):
         if use_mpi:
             cmd += ['-n', '1']
+        if trtllm_llmapi_launch:
+            cmd += ['trtllm-llmapi-launch']
         cmd += [tritonserver, f'--model-repository={model_repo}']
 
         # Add port configuration
@@ -212,7 +221,7 @@ def get_cmd(world_size, tritonserver, grpc_port, http_port, metrics_port,
                   args.http_port, args.metrics_port, args.model_repo, args.log,
                   args.log_file, args.tensorrt_llm_model_name,
                   args.oversubscribe, args.multimodal_gpu0_cuda_mem_pool_bytes,
-                  args.no_mpi)
+                  args.no_mpi, args.trtllm_llmapi_launch)
     env = os.environ.copy()
     if args.multi_model:
         if not args.disable_spawn_processes:
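
For quick reference, the sketch below mirrors the launcher-flag selection that the updated `test_llmapi_backend` performs for the two parametrized tensor-parallel sizes. It is illustrative only and not part of the patch; the paths and the `tensor_parallel_size` loop variable are placeholders.

```python
# Illustrative sketch (not part of the patch): how the test now chooses
# launch_triton_server.py flags per TENSOR_PARALLEL_SIZE value.
launch_server_py = "triton_backend/scripts/launch_triton_server.py"
new_model_repo = "/tmp/llmapi_model_repo"  # hypothetical model repository path

for tensor_parallel_size in ("1", "4"):
    cmd = (f"python3 {launch_server_py}"
           f" --world_size={tensor_parallel_size}"
           f" --model_repo={new_model_repo}")
    if tensor_parallel_size == "4":
        # Multi-GPU case: wrap each tritonserver rank with trtllm-llmapi-launch
        # and allow mpirun to oversubscribe slots, as added in this change.
        cmd += " --trtllm_llmapi_launch"
        cmd += " --oversubscribe"
    else:
        # Single-GPU case keeps the existing MPI-free path.
        cmd += " --no-mpi"
    print(cmd)
```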