1 change: 1 addition & 0 deletions jenkins/L0_Test.groovy
@@ -1701,6 +1701,7 @@ def launchTestJobs(pipeline, testFilter, dockerNode=null)
         "DGX_H100-4_GPUs-PyTorch-DeepSeek-1": ["dgx-h100-x4", "l0_dgx_h100", 1, 2, 4],
         "DGX_H100-4_GPUs-PyTorch-DeepSeek-2": ["dgx-h100-x4", "l0_dgx_h100", 2, 2, 4],
         "DGX_H100-4_GPUs-PyTorch-Others-1": ["dgx-h100-x4", "l0_dgx_h100", 1, 1, 4],
+        "DGX_H100-4_GPUs-Triton-[Post-Merge]-1": ["dgx-h100-x4", "l0_dgx_h100", 1, 1, 4],
         "DGX_H100-4_GPUs-CPP-1": ["dgx-h100-x4", "l0_dgx_h100", 1, 1, 4],
         "A10-PyTorch-1": ["a10", "l0_a10", 1, 1],
         "A10-CPP-1": ["a10", "l0_a10", 1, 1],
19 changes: 11 additions & 8 deletions tests/integration/defs/triton_server/test_triton_llm.py
@@ -3627,8 +3627,7 @@ def test_benchmark_core_model(
                          ids=["disableDecoupleMode", "enableDecoupleMode"])
 # TODO: [JIRA-4496] Add batch support in llmapi backend and add tests here.
 @pytest.mark.parametrize("TRITON_MAX_BATCH_SIZE", ["0"])
-# TODO: [JIRA-4040] Add more tensor parallel size
-@pytest.mark.parametrize("TENSOR_PARALLEL_SIZE", ["1"])
+@pytest.mark.parametrize("TENSOR_PARALLEL_SIZE", ["1", "4"])
 def test_llmapi_backend(E2E_MODEL_NAME, DECOUPLED_MODE, TRITON_MAX_BATCH_SIZE,
                         TENSOR_PARALLEL_SIZE,
                         llm_backend_inflight_batcher_llm_root, llm_backend_venv,
@@ -3644,6 +3643,8 @@ def test_llmapi_backend(E2E_MODEL_NAME, DECOUPLED_MODE, TRITON_MAX_BATCH_SIZE,
         model_config = yaml.safe_load(f)
     model_config["triton_config"]["decoupled"] = DECOUPLED_MODE
     model_config["triton_config"]["max_batch_size"] = int(TRITON_MAX_BATCH_SIZE)
+    model_config["tensor_parallel_size"] = int(TENSOR_PARALLEL_SIZE)
+    model_config["kv_cache_config"] = {"free_gpu_memory_fraction": 0.8}
     with open(model_config_path, "w") as f:
         yaml.dump(model_config, f)

@@ -3653,12 +3654,14 @@ def test_llmapi_backend(E2E_MODEL_NAME, DECOUPLED_MODE, TRITON_MAX_BATCH_SIZE,
     # Launch Triton Server
     launch_server_py = os.path.join(llm_backend_repo_root, "scripts",
                                     "launch_triton_server.py")
-    print_info(
-        f"DEBUG:: launch_server with args: python3 {launch_server_py} --world_size={TENSOR_PARALLEL_SIZE} --model_repo={new_model_repo} --no-mpi"
-    )
-    check_call(
-        f"python3 {launch_server_py} --world_size={TENSOR_PARALLEL_SIZE} --model_repo={new_model_repo} --no-mpi",
-        shell=True)
+    cmd = f"python3 {launch_server_py} --world_size={TENSOR_PARALLEL_SIZE} --model_repo={new_model_repo}"
+    if TENSOR_PARALLEL_SIZE == "4":
+        cmd += " --trtllm_llmapi_launch"
+        cmd += " --oversubscribe"
+    else:
+        cmd += " --no-mpi"
+    print_info(f"DEBUG:: launch_server with args: {cmd}")
+    check_call(cmd, shell=True)
     check_server_ready()

     # Speed up the test by running multiple tests with different configurations sharing the same triton server.
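For reference, a minimal sketch of the two launcher command lines the updated test now builds, one per TENSOR_PARALLEL_SIZE value; the paths below are placeholders rather than values taken from the repository:

```python
# Sketch only: mirrors the branching added to test_llmapi_backend above.
# launch_server_py and new_model_repo are placeholder paths for illustration.
launch_server_py = "triton_backend/scripts/launch_triton_server.py"
new_model_repo = "/tmp/llmapi_model_repo"

for tensor_parallel_size in ("1", "4"):
    cmd = (f"python3 {launch_server_py} "
           f"--world_size={tensor_parallel_size} --model_repo={new_model_repo}")
    if tensor_parallel_size == "4":
        # Multi-GPU case: keep MPI and wrap tritonserver with trtllm-llmapi-launch.
        cmd += " --trtllm_llmapi_launch"
        cmd += " --oversubscribe"
    else:
        # Single-GPU case keeps the existing MPI-free launch path.
        cmd += " --no-mpi"
    print(cmd)
```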
14 changes: 14 additions & 0 deletions tests/integration/test_lists/test-db/l0_dgx_h100.yml
@@ -136,3 +136,17 @@ l0_dgx_h100:
   - cpp/test_multi_gpu.py::TestDisagg::test_spawn_orchestrator[llama-ucx_kvcache-90]
   - cpp/test_multi_gpu.py::TestDisagg::test_orchestrator_params[llama-nixl_kvcache-90] TIMEOUT (90)
   - cpp/test_multi_gpu.py::TestDisagg::test_spawn_orchestrator[llama-nixl_kvcache-90]
+- condition:
+    ranges:
+      system_gpu_count:
+        gte: 4
+        lte: 4
+    wildcards:
+      gpu:
+      - '*h100*'
+      linux_distribution_name: ubuntu*
+    terms:
+      stage: post_merge
+      backend: triton
+  tests:
+  - triton_server/test_triton_llm.py::test_llmapi_backend[4-0-disableDecoupleMode-tensorrt_llm]
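For readers decoding the bracketed test ID above: pytest builds it from the stacked parametrize decorators in test_triton_llm.py, with the decorator closest to the function contributing the first component, so 4-0-disableDecoupleMode-tensorrt_llm selects TENSOR_PARALLEL_SIZE=4, TRITON_MAX_BATCH_SIZE=0, decoupled mode disabled, and the tensorrt_llm end-to-end model. A minimal, self-contained sketch of that ID scheme; the E2E_MODEL_NAME and DECOUPLED_MODE parameter values below are placeholders, only TRITON_MAX_BATCH_SIZE and TENSOR_PARALLEL_SIZE come from the diff:

```python
# Hypothetical standalone reproduction of the ID scheme; running
# `pytest --collect-only -q` on this file lists IDs such as
# test_llmapi_backend[4-0-disableDecoupleMode-tensorrt_llm].
import pytest


@pytest.mark.parametrize("E2E_MODEL_NAME", ["tensorrt_llm"])  # placeholder values
@pytest.mark.parametrize("DECOUPLED_MODE", [False],
                         ids=["disableDecoupleMode"])  # placeholder values
@pytest.mark.parametrize("TRITON_MAX_BATCH_SIZE", ["0"])
@pytest.mark.parametrize("TENSOR_PARALLEL_SIZE", ["1", "4"])
def test_llmapi_backend(E2E_MODEL_NAME, DECOUPLED_MODE, TRITON_MAX_BATCH_SIZE,
                        TENSOR_PARALLEL_SIZE):
    # Parameters from the decorator nearest the function appear first in the ID.
    pass
```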
13 changes: 11 additions & 2 deletions triton_backend/scripts/launch_triton_server.py
@@ -97,6 +97,13 @@ def parse_arguments():
         'Append --oversubscribe to the mpirun command. Mainly for SLURM MPI usecases.'
     )

+    parser.add_argument(
+        '--trtllm_llmapi_launch',
+        action='store_true',
+        help='Launch tritonserver with trtllm-llmapi-launch',
+        default=False,
+    )
+
     return parser.parse_args()


@@ -147,7 +154,7 @@ def add_port_config(cmd, grpc_port, http_port, metrics_port):

 def get_cmd(world_size, tritonserver, grpc_port, http_port, metrics_port,
             model_repo, log, log_file, tensorrt_llm_model_name, oversubscribe,
-            multimodal_gpu0_cuda_mem_pool_bytes, no_mpi):
+            multimodal_gpu0_cuda_mem_pool_bytes, no_mpi, trtllm_llmapi_launch):
     if no_mpi:
         assert world_size == 1, "world size must be 1 when using no-mpi"

@@ -162,6 +169,8 @@ def get_cmd(world_size, tritonserver, grpc_port, http_port, metrics_port,
     for i in range(world_size):
         if use_mpi:
             cmd += ['-n', '1']
+        if trtllm_llmapi_launch:
+            cmd += ['trtllm-llmapi-launch']
         cmd += [tritonserver, f'--model-repository={model_repo}']

         # Add port configuration
@@ -212,7 +221,7 @@ def get_cmd(world_size, tritonserver, grpc_port, http_port, metrics_port,
                   args.http_port, args.metrics_port, args.model_repo, args.log,
                   args.log_file, args.tensorrt_llm_model_name,
                   args.oversubscribe, args.multimodal_gpu0_cuda_mem_pool_bytes,
-                  args.no_mpi)
+                  args.no_mpi, args.trtllm_llmapi_launch)
     env = os.environ.copy()
     if args.multi_model:
         if not args.disable_spawn_processes:
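To make the effect of the new flag concrete, here is a sketch of just the per-rank loop from get_cmd above; the mpirun preamble and the ':' rank separator are assumptions for illustration, and the port, log, and CUDA memory pool flags of the real script are omitted:

```python
# Sketch under stated assumptions: reproduces only the loop shown in the diff.
def sketch_cmd(world_size, use_mpi, trtllm_llmapi_launch,
               tritonserver="tritonserver", model_repo="/models"):
    cmd = ["mpirun", "--allow-run-as-root"] if use_mpi else []  # assumed preamble
    for i in range(world_size):
        if use_mpi:
            cmd += ["-n", "1"]
        if trtllm_llmapi_launch:
            # Each tritonserver rank gets prefixed with the trtllm-llmapi-launch wrapper.
            cmd += ["trtllm-llmapi-launch"]
        cmd += [tritonserver, f"--model-repository={model_repo}"]
        if use_mpi and i != world_size - 1:
            cmd += [":"]  # assumed MPMD separator between ranks
    return cmd


# world_size=4 with the new flag: every rank runs
#   trtllm-llmapi-launch tritonserver --model-repository=/models
print(" ".join(sketch_cmd(4, use_mpi=True, trtllm_llmapi_launch=True)))
```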