1 change: 1 addition & 0 deletions jenkins/L0_Test.groovy
@@ -1701,6 +1701,7 @@ def launchTestJobs(pipeline, testFilter, dockerNode=null)
         "DGX_H100-4_GPUs-PyTorch-DeepSeek-1": ["dgx-h100-x4", "l0_dgx_h100", 1, 2, 4],
         "DGX_H100-4_GPUs-PyTorch-DeepSeek-2": ["dgx-h100-x4", "l0_dgx_h100", 2, 2, 4],
         "DGX_H100-4_GPUs-PyTorch-Others-1": ["dgx-h100-x4", "l0_dgx_h100", 1, 1, 4],
+        "DGX_H100-4_GPUs-Triton-[Post-Merge]-1": ["dgx-h100-x4", "l0_dgx_h100", 1, 1, 4],
         "DGX_H100-4_GPUs-CPP-1": ["dgx-h100-x4", "l0_dgx_h100", 1, 1, 4],
         "A10-PyTorch-1": ["a10", "l0_a10", 1, 1],
         "A10-CPP-1": ["a10", "l0_a10", 1, 1],
19 changes: 11 additions & 8 deletions tests/integration/defs/triton_server/test_triton_llm.py
@@ -3627,8 +3627,7 @@ def test_benchmark_core_model(
                          ids=["disableDecoupleMode", "enableDecoupleMode"])
 # TODO: [JIRA-4496] Add batch support in llmapi backend and add tests here.
 @pytest.mark.parametrize("TRITON_MAX_BATCH_SIZE", ["0"])
-# TODO: [JIRA-4040] Add more tensor parallel size
-@pytest.mark.parametrize("TENSOR_PARALLEL_SIZE", ["1"])
+@pytest.mark.parametrize("TENSOR_PARALLEL_SIZE", ["1", "4"])
 def test_llmapi_backend(E2E_MODEL_NAME, DECOUPLED_MODE, TRITON_MAX_BATCH_SIZE,
                         TENSOR_PARALLEL_SIZE,
                         llm_backend_inflight_batcher_llm_root, llm_backend_venv,
@@ -3644,6 +3643,8 @@ def test_llmapi_backend(E2E_MODEL_NAME, DECOUPLED_MODE, TRITON_MAX_BATCH_SIZE,
         model_config = yaml.safe_load(f)
     model_config["triton_config"]["decoupled"] = DECOUPLED_MODE
     model_config["triton_config"]["max_batch_size"] = int(TRITON_MAX_BATCH_SIZE)
+    model_config["tensor_parallel_size"] = int(TENSOR_PARALLEL_SIZE)
+    model_config["kv_cache_config"] = {"free_gpu_memory_fraction": 0.8}
     with open(model_config_path, "w") as f:
         yaml.dump(model_config, f)

@@ -3653,12 +3654,14 @@ def test_llmapi_backend(E2E_MODEL_NAME, DECOUPLED_MODE, TRITON_MAX_BATCH_SIZE,
     # Launch Triton Server
     launch_server_py = os.path.join(llm_backend_repo_root, "scripts",
                                     "launch_triton_server.py")
-    print_info(
-        f"DEBUG:: launch_server with args: python3 {launch_server_py} --world_size={TENSOR_PARALLEL_SIZE} --model_repo={new_model_repo} --no-mpi"
-    )
-    check_call(
-        f"python3 {launch_server_py} --world_size={TENSOR_PARALLEL_SIZE} --model_repo={new_model_repo} --no-mpi",
-        shell=True)
+    cmd = f"python3 {launch_server_py} --world_size={TENSOR_PARALLEL_SIZE} --model_repo={new_model_repo}"
+    if TENSOR_PARALLEL_SIZE == "4":
+        cmd += " --trtllm_llmapi_launch"
+        cmd += " --oversubscribe"
+    else:
+        cmd += " --no-mpi"
+    print_info(f"DEBUG:: launch_server with args: {cmd}")
+    check_call(cmd, shell=True)
     check_server_ready()

     # Speed up the test by running multiple tests with different configurations sharing the same triton server.
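For reference, a minimal sketch of the two launcher command lines the updated test now builds, one per TENSOR_PARALLEL_SIZE value; the paths below are placeholders rather than values taken from the repository:

```python
# Sketch only: mirrors the branching added to test_llmapi_backend above.
# launch_server_py and new_model_repo are placeholder paths for illustration.
launch_server_py = "triton_backend/scripts/launch_triton_server.py"
new_model_repo = "/tmp/llmapi_model_repo"

for tensor_parallel_size in ("1", "4"):
    cmd = (f"python3 {launch_server_py} "
           f"--world_size={tensor_parallel_size} --model_repo={new_model_repo}")
    if tensor_parallel_size == "4":
        # Multi-GPU case: keep MPI and wrap tritonserver with trtllm-llmapi-launch.
        cmd += " --trtllm_llmapi_launch"
        cmd += " --oversubscribe"
    else:
        # Single-GPU case keeps the existing MPI-free launch path.
        cmd += " --no-mpi"
    print(cmd)
```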
14 changes: 14 additions & 0 deletions tests/integration/test_lists/test-db/l0_dgx_h100.yml
@@ -136,3 +136,17 @@ l0_dgx_h100:
   - cpp/test_multi_gpu.py::TestDisagg::test_spawn_orchestrator[llama-ucx_kvcache-90]
   - cpp/test_multi_gpu.py::TestDisagg::test_orchestrator_params[llama-nixl_kvcache-90] TIMEOUT (90)
   - cpp/test_multi_gpu.py::TestDisagg::test_spawn_orchestrator[llama-nixl_kvcache-90]
+- condition:
+    ranges:
+      system_gpu_count:
+        gte: 4
+        lte: 4
+    wildcards:
+      gpu:
+      - '*h100*'
+      linux_distribution_name: ubuntu*
+    terms:
+      stage: post_merge
+      backend: triton
+  tests:
+  - triton_server/test_triton_llm.py::test_llmapi_backend[4-0-disableDecoupleMode-tensorrt_llm]
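For readers decoding the bracketed test ID above: pytest builds it from the stacked parametrize decorators in test_triton_llm.py, with the decorator closest to the function contributing the first component, so 4-0-disableDecoupleMode-tensorrt_llm selects TENSOR_PARALLEL_SIZE=4, TRITON_MAX_BATCH_SIZE=0, decoupled mode disabled, and the tensorrt_llm end-to-end model. A minimal, self-contained sketch of that ID scheme; the E2E_MODEL_NAME and DECOUPLED_MODE parameter values below are placeholders, only TRITON_MAX_BATCH_SIZE and TENSOR_PARALLEL_SIZE come from the diff:

```python
# Hypothetical standalone reproduction of the ID scheme; running
# `pytest --collect-only -q` on this file lists IDs such as
# test_llmapi_backend[4-0-disableDecoupleMode-tensorrt_llm].
import pytest


@pytest.mark.parametrize("E2E_MODEL_NAME", ["tensorrt_llm"])  # placeholder values
@pytest.mark.parametrize("DECOUPLED_MODE", [False],
                         ids=["disableDecoupleMode"])  # placeholder values
@pytest.mark.parametrize("TRITON_MAX_BATCH_SIZE", ["0"])
@pytest.mark.parametrize("TENSOR_PARALLEL_SIZE", ["1", "4"])
def test_llmapi_backend(E2E_MODEL_NAME, DECOUPLED_MODE, TRITON_MAX_BATCH_SIZE,
                        TENSOR_PARALLEL_SIZE):
    # Parameters from the decorator nearest the function appear first in the ID.
    pass
```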
13 changes: 11 additions & 2 deletions triton_backend/scripts/launch_triton_server.py
@@ -97,6 +97,13 @@ def parse_arguments():
         'Append --oversubscribe to the mpirun command. Mainly for SLURM MPI usecases.'
     )

+    parser.add_argument(
+        '--trtllm_llmapi_launch',
+        action='store_true',
+        help='Launch tritonserver with trtllm-llmapi-launch',
+        default=False,
+    )
+
     return parser.parse_args()


@@ -147,7 +154,7 @@ def add_port_config(cmd, grpc_port, http_port, metrics_port):

 def get_cmd(world_size, tritonserver, grpc_port, http_port, metrics_port,
             model_repo, log, log_file, tensorrt_llm_model_name, oversubscribe,
-            multimodal_gpu0_cuda_mem_pool_bytes, no_mpi):
+            multimodal_gpu0_cuda_mem_pool_bytes, no_mpi, trtllm_llmapi_launch):
     if no_mpi:
         assert world_size == 1, "world size must be 1 when using no-mpi"

@@ -162,6 +169,8 @@ def get_cmd(world_size, tritonserver, grpc_port, http_port, metrics_port,
     for i in range(world_size):
         if use_mpi:
             cmd += ['-n', '1']
+        if trtllm_llmapi_launch:
+            cmd += ['trtllm-llmapi-launch']
         cmd += [tritonserver, f'--model-repository={model_repo}']

         # Add port configuration
@@ -212,7 +221,7 @@ def get_cmd(world_size, tritonserver, grpc_port, http_port, metrics_port,
                   args.http_port, args.metrics_port, args.model_repo, args.log,
                   args.log_file, args.tensorrt_llm_model_name,
                   args.oversubscribe, args.multimodal_gpu0_cuda_mem_pool_bytes,
-                  args.no_mpi)
+                  args.no_mpi, args.trtllm_llmapi_launch)
     env = os.environ.copy()
     if args.multi_model:
         if not args.disable_spawn_processes:
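To make the effect of the new flag concrete, here is a sketch of just the per-rank loop from get_cmd above; the mpirun preamble and the ':' rank separator are assumptions for illustration, and the port, log, and CUDA memory pool flags of the real script are omitted:

```python
# Sketch under stated assumptions: reproduces only the loop shown in the diff.
def sketch_cmd(world_size, use_mpi, trtllm_llmapi_launch,
               tritonserver="tritonserver", model_repo="/models"):
    cmd = ["mpirun", "--allow-run-as-root"] if use_mpi else []  # assumed preamble
    for i in range(world_size):
        if use_mpi:
            cmd += ["-n", "1"]
        if trtllm_llmapi_launch:
            # Each tritonserver rank gets prefixed with the trtllm-llmapi-launch wrapper.
            cmd += ["trtllm-llmapi-launch"]
        cmd += [tritonserver, f"--model-repository={model_repo}"]
        if use_mpi and i != world_size - 1:
            cmd += [":"]  # assumed MPMD separator between ranks
    return cmd


# world_size=4 with the new flag: every rank runs
#   trtllm-llmapi-launch tritonserver --model-repository=/models
print(" ".join(sketch_cmd(4, use_mpi=True, trtllm_llmapi_launch=True)))
```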