diff --git a/tensorrt_llm/commands/serve.py b/tensorrt_llm/commands/serve.py
index df96a1868ca..7de263ea89f 100644
--- a/tensorrt_llm/commands/serve.py
+++ b/tensorrt_llm/commands/serve.py
@@ -71,7 +71,7 @@ def _signal_handler_cleanup_child(signum, frame):
 
 def get_llm_args(model: str,
                  tokenizer: Optional[str] = None,
-                 backend: Optional[str] = None,
+                 backend: str = "pytorch",
                  max_beam_width: int = BuildConfig.max_beam_width,
                  max_batch_size: int = BuildConfig.max_batch_size,
                  max_num_tokens: int = BuildConfig.max_num_tokens,
@@ -165,8 +165,8 @@ def launch_server(host: str,
               help="Hostname of the server.")
 @click.option("--port", type=int, default=8000, help="Port of the server.")
 @click.option("--backend",
-              type=click.Choice(["pytorch"]),
-              default=None,
+              type=click.Choice(["pytorch", "trt"]),
+              default="pytorch",
               help="Set to 'pytorch' for pytorch path. Default is cpp path.")
 @click.option('--log_level',
               type=click.Choice(severity_map.keys()),
diff --git a/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxtp2_gentp1_trt_backend.yaml b/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxtp2_gentp1_trt_backend.yaml
index bde3132f8a1..388be9d4d66 100644
--- a/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxtp2_gentp1_trt_backend.yaml
+++ b/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxtp2_gentp1_trt_backend.yaml
@@ -2,6 +2,7 @@ hostname: localhost
 port: 8000
 model: TinyLlama/TinyLlama-1.1B-Chat-v1.0
 free_gpu_memory_fraction: 0.25
+backend: "trt"
 context_servers:
   num_instances: 1
   tensor_parallel_size: 2
diff --git a/tests/integration/defs/disaggregated/test_configs/disagg_config_gen_only_trt_backend.yaml b/tests/integration/defs/disaggregated/test_configs/disagg_config_gen_only_trt_backend.yaml
index 386a8fba01f..6d9fc7d07fd 100644
--- a/tests/integration/defs/disaggregated/test_configs/disagg_config_gen_only_trt_backend.yaml
+++ b/tests/integration/defs/disaggregated/test_configs/disagg_config_gen_only_trt_backend.yaml
@@ -1,6 +1,7 @@
 hostname: localhost
 port: 8000
 model: TinyLlama/TinyLlama-1.1B-Chat-v1.0
+backend: "trt"
 context_servers:
   num_instances: 0
 generation_servers:
diff --git a/tests/integration/defs/disaggregated/test_configs/disagg_config_trt_backend.yaml b/tests/integration/defs/disaggregated/test_configs/disagg_config_trt_backend.yaml
index fa57d987de4..885991c886c 100644
--- a/tests/integration/defs/disaggregated/test_configs/disagg_config_trt_backend.yaml
+++ b/tests/integration/defs/disaggregated/test_configs/disagg_config_trt_backend.yaml
@@ -2,6 +2,7 @@ hostname: localhost
 port: 8000
 model: TinyLlama/TinyLlama-1.1B-Chat-v1.0
 free_gpu_memory_fraction: 0.25
+backend: "trt"
 context_servers:
   num_instances: 1
   tensor_parallel_size: 1
diff --git a/tests/integration/defs/stress_test/stress_test.py b/tests/integration/defs/stress_test/stress_test.py
index f0f85fe51e3..03456d8d5c5 100644
--- a/tests/integration/defs/stress_test/stress_test.py
+++ b/tests/integration/defs/stress_test/stress_test.py
@@ -364,12 +364,11 @@ def test_run_stress_test(config, stress_time_timeout, backend,
     """
     # Create a new ModelConfig with the backend parameter
     # Convert 'trt' to None as expected by the ModelConfig
-    backend_param = None if backend == "trt" else backend
     new_config = ModelConfig(model_dir=config.model_dir,
                              tp_size=config.tp_size,
                              memory_requirement=config.memory_requirement,
-                             backend=backend_param)
+                             backend=backend)
 
     # Extract stress_time and stress_timeout from the tuple
     stress_time, stress_timeout = stress_time_timeout
@@ -542,6 +541,8 @@ def stress_test(config,
         str(config.tp_size),
         "--pp_size",
         str(test_server_config.pp_size),
+        "--backend",
+        config.backend,
     ]
 
     # Only add ep_size parameter if it's not None
@@ -560,12 +561,6 @@ def stress_test(config,
             extra_llm_options_path,
         ])
 
-    # Add backend option only if specified
-    # backend = None means trt backend
-    # backend = pytorch means pytorch backend
-    if config.backend:
-        server_cmd.extend(["--backend", config.backend])
-
     # Log the command we're about to run
     print_info(f"Running command: {' '.join(server_cmd)}")
diff --git a/tests/integration/defs/test_e2e.py b/tests/integration/defs/test_e2e.py
index 1e8098330f4..d0674717e2e 100644
--- a/tests/integration/defs/test_e2e.py
+++ b/tests/integration/defs/test_e2e.py
@@ -1407,13 +1407,7 @@ def test_openai_completions_example(llm_root, llm_venv, backend: str):
 
 @pytest.mark.parametrize("backend", ["pytorch", "trt"])
 def test_openai_chat_example(llm_root, llm_venv, backend: str):
-    example_root = Path(os.path.join(llm_root, "examples", "apps"))
     test_root = unittest_path() / "llmapi" / "apps"
-    llm_venv.run_cmd([
-        "-m", "pip", "install", "-r",
-        os.path.join(example_root, "requirements.txt")
-    ])
-
     llm_venv.run_cmd([
         "-m", "pytest",
         str(test_root / "_test_openai_chat.py"), "-k", backend
@@ -1435,13 +1429,7 @@ def test_openai_lora(llm_root, llm_venv):
 
 
 def test_openai_chat_multimodal_example(llm_root, llm_venv):
-    example_root = Path(os.path.join(llm_root, "examples", "apps"))
     test_root = unittest_path() / "llmapi" / "apps"
-    llm_venv.run_cmd([
-        "-m", "pip", "install", "-r",
-        os.path.join(example_root, "requirements.txt")
-    ])
-
     llm_venv.run_cmd(
         ["-m", "pytest", str(test_root / "_test_openai_chat_multimodal.py")])
@@ -1449,7 +1437,6 @@ def test_openai_chat_multimodal_example(llm_root, llm_venv):
 
 def test_openai_chat_structural_tag_example(llm_venv):
     test_root = unittest_path() / "llmapi" / "apps"
-
     llm_venv.run_cmd([
         "-m", "pytest",
         str(test_root / "_test_openai_chat_structural_tag.py")
@@ -1459,13 +1446,7 @@
 
 @pytest.mark.skip_less_device(2)
 @pytest.mark.skip_less_device_memory(40000)
 def test_openai_multi_chat_example(llm_root, llm_venv):
-    example_root = Path(os.path.join(llm_root, "examples", "apps"))
     test_root = unittest_path() / "llmapi" / "apps"
-    llm_venv.run_cmd([
-        "-m", "pip", "install", "-r",
-        os.path.join(example_root, "requirements.txt")
-    ])
-
     llm_venv.run_cmd(
         ["-m", "pytest", str(test_root / "_test_openai_multi_chat.py")])
@@ -1475,13 +1456,7 @@
 
 @pytest.mark.skip_less_device(4)
 @pytest.mark.skip_less_device_memory(80000)
 def test_openai_consistent_chat(llm_root, llm_venv):
-    example_root = Path(os.path.join(llm_root, "examples", "apps"))
     test_root = unittest_path() / "llmapi" / "apps"
-    llm_venv.run_cmd([
-        "-m", "pip", "install", "-r",
-        os.path.join(example_root, "requirements.txt")
-    ])
-
     llm_venv.run_cmd(
         ["-m", "pytest", str(test_root / "_test_openai_consistent_chat.py")])
@@ -1491,13 +1466,7 @@
 @pytest.mark.skip_less_device(4)
 @pytest.mark.skip_less_device_memory(80000)
 def test_openai_multinodes_chat_tp16pp1(llm_root, llm_venv):
-    example_root = Path(os.path.join(llm_root, "examples", "apps"))
     test_root = unittest_path() / "llmapi" / "apps"
-    llm_venv.run_cmd([
-        "-m", "pip", "install", "-r",
-        os.path.join(example_root, "requirements.txt")
-    ])
-
     llm_venv.run_cmd([
         "-m", "pytest", "-k", "tp16pp1",
         str(test_root / "_test_openai_multi_nodes.py")
@@ -1508,13 +1477,7 @@
 @pytest.mark.skip_less_device(4)
 @pytest.mark.skip_less_device_memory(80000)
 def test_openai_multinodes_chat_tp8pp2(llm_root, llm_venv):
-    example_root = Path(os.path.join(llm_root, "examples", "apps"))
     test_root = unittest_path() / "llmapi" / "apps"
-    llm_venv.run_cmd([
-        "-m", "pip", "install", "-r",
-        os.path.join(example_root, "requirements.txt")
-    ])
-
     llm_venv.run_cmd([
         "-m", "pytest", "-k", "tp8pp2",
         str(test_root / "_test_openai_multi_nodes.py")
@@ -1523,13 +1486,7 @@
 
 
 @pytest.mark.skip_less_device_memory(80000)
 def test_trtllm_benchmark_serving(llm_root, llm_venv):
-    example_root = Path(os.path.join(llm_root, "examples", "apps"))
     test_root = unittest_path() / "llmapi" / "apps"
-    llm_venv.run_cmd([
-        "-m", "pip", "install", "-r",
-        os.path.join(example_root, "requirements.txt")
-    ])
-
     llm_venv.run_cmd(
         ["-m", "pytest", str(test_root / "_test_trtllm_serve_benchmark.py")])
diff --git a/tests/unittest/llmapi/apps/_test_openai_chat.py b/tests/unittest/llmapi/apps/_test_openai_chat.py
index aeea774e788..2306afe9456 100644
--- a/tests/unittest/llmapi/apps/_test_openai_chat.py
+++ b/tests/unittest/llmapi/apps/_test_openai_chat.py
@@ -20,9 +20,7 @@ def model_name():
     return "llama-models-v2/TinyLlama-1.1B-Chat-v1.0"
 
 
-@pytest.fixture(scope="module",
-                params=[None, 'pytorch'],
-                ids=["trt", "pytorch"])
+@pytest.fixture(scope="module", params=["trt", "pytorch"])
 def backend(request):
     return request.param
 
@@ -67,10 +65,9 @@ def temp_extra_llm_api_options_file(request):
 def server(model_name: str, backend: str, extra_llm_api_options: bool,
            temp_extra_llm_api_options_file: str, num_postprocess_workers: int):
     model_path = get_model_path(model_name)
-    if backend == "pytorch":
-        args = ["--backend", f"{backend}"]
-    else:
-        args = ["--max_beam_width", "4"]
+    args = ["--backend", f"{backend}"]
+    if backend == "trt":
+        args.extend(["--max_beam_width", "4"])
     if extra_llm_api_options:
         args.extend(
             ["--extra_llm_api_options", temp_extra_llm_api_options_file])
diff --git a/tests/unittest/llmapi/apps/_test_openai_completions.py b/tests/unittest/llmapi/apps/_test_openai_completions.py
index 79b9b49a1a7..7beeff0179b 100644
--- a/tests/unittest/llmapi/apps/_test_openai_completions.py
+++ b/tests/unittest/llmapi/apps/_test_openai_completions.py
@@ -14,7 +14,7 @@ def model_name():
     return "llama-models-v2/TinyLlama-1.1B-Chat-v1.0"
 
 
-@pytest.fixture(scope="module", params=["trt", 'pytorch'])
+@pytest.fixture(scope="module", params=["trt", "pytorch"])
 def backend(request):
     return request.param
 
@@ -29,10 +29,9 @@ def num_postprocess_workers(request):
 @pytest.fixture(scope="module")
 def server(model_name: str, backend: str, num_postprocess_workers: int):
     model_path = get_model_path(model_name)
-    if backend == "pytorch":
-        args = ["--backend", f"{backend}"]
-    else:
-        args = ["--max_beam_width", "4"]
+    args = ["--backend", f"{backend}"]
+    if backend == "trt":
+        args.extend(["--max_beam_width", "4"])
     args.extend(["--num_postprocess_workers", f"{num_postprocess_workers}"])
     with RemoteOpenAIServer(model_path, args) as remote_server:
         yield remote_server
diff --git a/tests/unittest/llmapi/apps/_test_openai_metrics.py b/tests/unittest/llmapi/apps/_test_openai_metrics.py
index 9d207ae4e9a..25047eea1ea 100755
--- a/tests/unittest/llmapi/apps/_test_openai_metrics.py
+++ b/tests/unittest/llmapi/apps/_test_openai_metrics.py
@@ -21,7 +21,6 @@ def client():
     llm = PyTorchLLM(model=llama_model_path,
                      build_config=build_config,
                      kv_cache_config=KvCacheConfig(),
-                     backend="pytorch",
                      enable_iter_perf_stats=True)
     hf_tokenizer = AutoTokenizer.from_pretrained(llama_model_path)
diff --git a/tests/unittest/llmapi/apps/_test_openai_misc.py b/tests/unittest/llmapi/apps/_test_openai_misc.py
index 52c8ff98535..51e3d4f840c 100644
--- a/tests/unittest/llmapi/apps/_test_openai_misc.py
+++ b/tests/unittest/llmapi/apps/_test_openai_misc.py
@@ -15,17 +15,17 @@ def model_name():
     return "llama-models-v2/TinyLlama-1.1B-Chat-v1.0"
 
 
-@pytest.fixture(scope="module", params=["trt", 'pytorch'])
+@pytest.fixture(scope="module", params=["trt", "pytorch"])
 def backend(request):
     return request.param
 
 
-@pytest.fixture(scope="module", params=['8'])
+@pytest.fixture(scope="module", params=["8"])
 def max_batch_size(request):
     return request.param
 
 
-@pytest.fixture(scope="module", params=['80000'])
+@pytest.fixture(scope="module", params=["80000"])
 def max_seq_len(request):
     return request.param
 
@@ -34,19 +34,13 @@ def max_seq_len(request):
 def server(model_name: str, backend: str, max_batch_size: str,
            max_seq_len: str):
     model_path = get_model_path(model_name)
-    args = []
-    if backend == "pytorch":
-        args.append("--backend")
-        args.append(backend)
+    args = ["--backend", f"{backend}"]
     if backend != "pytorch":
-        args.append("--max_beam_width")
-        args.append("4")
+        args.extend(["--max_beam_width", "4"])
     if max_batch_size is not None:
-        args.append("--max_batch_size")
-        args.append(max_batch_size)
+        args.extend(["--max_batch_size", max_batch_size])
     if max_seq_len is not None:
-        args.append("--max_seq_len")
-        args.append(max_seq_len)
+        args.extend(["--max_seq_len", max_seq_len])
 
     with RemoteOpenAIServer(model_path, args) as remote_server:
         yield remote_server
diff --git a/tests/unittest/llmapi/apps/_test_openai_multi_gpu.py b/tests/unittest/llmapi/apps/_test_openai_multi_gpu.py
index cff9962bfa6..6ac65c42b25 100644
--- a/tests/unittest/llmapi/apps/_test_openai_multi_gpu.py
+++ b/tests/unittest/llmapi/apps/_test_openai_multi_gpu.py
@@ -15,9 +15,7 @@ def model_name():
     return "llama-models-v3/llama-v3-8b-instruct-hf"
 
 
-@pytest.fixture(scope="module",
-                params=[None, 'pytorch'],
-                ids=["trt", "pytorch"])
+@pytest.fixture(scope="module", params=["trt", "pytorch"])
 def backend(request):
     return request.param
 
@@ -55,13 +53,10 @@ def temp_extra_llm_api_options_file(request):
 def server(model_name: str, backend: str, extra_llm_api_options: bool,
            temp_extra_llm_api_options_file: str):
     model_path = get_model_path(model_name)
-    args = ["--tp_size", "2", "--max_beam_width", "1"]
-    if backend is not None:
-        args.append("--backend")
-        args.append(backend)
+    args = ["--tp_size", "2", "--max_beam_width", "1", "--backend", backend]
     if extra_llm_api_options:
-        args.append("--extra_llm_api_options")
-        args.append(temp_extra_llm_api_options_file)
+        args.extend(
+            ["--extra_llm_api_options", temp_extra_llm_api_options_file])
     with RemoteOpenAIServer(model_path, args) as remote_server:
         yield remote_server
 
@@ -95,7 +90,7 @@ def test_chat_tp2(client: openai.OpenAI, model_name: str):
     assert len(chat_completion.choices) == 1
     assert chat_completion.usage.completion_tokens == 1
     message = chat_completion.choices[0].message
-    assert message.content == 'Two'
+    assert message.content == "Two"
 
 
 @skip_single_gpu
diff --git a/tests/unittest/llmapi/apps/_test_openai_multi_nodes.py b/tests/unittest/llmapi/apps/_test_openai_multi_nodes.py
index eaea27597a9..7413745e51a 100644
--- a/tests/unittest/llmapi/apps/_test_openai_multi_nodes.py
+++ b/tests/unittest/llmapi/apps/_test_openai_multi_nodes.py
@@ -48,12 +48,17 @@ def server(model_name: str, backend: str, tp_pp_size: tuple):
     tp_size, pp_size = tp_pp_size
     device_count = torch.cuda.device_count()
     args = [
-        "--tp_size", f"{tp_size}", "--pp_size", f"{pp_size}", "--gpus_per_node",
-        f"{device_count}", "--kv_cache_free_gpu_memory_fraction", "0.95"
+        "--tp_size",
+        f"{tp_size}",
+        "--pp_size",
+        f"{pp_size}",
+        "--gpus_per_node",
+        f"{device_count}",
+        "--kv_cache_free_gpu_memory_fraction",
+        "0.95",
+        "--backend",
+        backend,
     ]
-    if backend is not None:
-        args.append("--backend")
-        args.append(backend)
     with RemoteOpenAIServer(model_path, args, llmapi_launch=True,
                             port=8001) as remote_server:
         yield remote_server
diff --git a/tests/unittest/llmapi/apps/_test_openai_reasoning.py b/tests/unittest/llmapi/apps/_test_openai_reasoning.py
index b20c365c3e0..d5cd7eb9eec 100644
--- a/tests/unittest/llmapi/apps/_test_openai_reasoning.py
+++ b/tests/unittest/llmapi/apps/_test_openai_reasoning.py
@@ -14,19 +14,15 @@ def model_name() -> str:
     return "DeepSeek-R1-Distill-Qwen-1.5B"
 
 
-@pytest.fixture(scope="module",
-                params=[None, 'pytorch'],
-                ids=["trt", "pytorch"])
+@pytest.fixture(scope="module", params=["trt", "pytorch"])
 def backend(request):
     return request.param
 
 
 @pytest.fixture(scope="module")
-def server(model_name: str, backend: str) -> RemoteOpenAIServer:
+def server(model_name: str, backend: str):
     model_path = get_model_path(model_name)
-    args = []
-    if backend == "pytorch":
-        args.extend(["--backend", f"{backend}"])
+    args = ["--backend", f"{backend}"]
     max_beam_width = 1 if backend == "pytorch" else 2
     args.extend(["--max_beam_width", str(max_beam_width)])
     args.extend(["--max_batch_size", "2", "--max_seq_len", "1024"])
@@ -68,7 +64,7 @@ def test_reasoning_parser(client: openai.OpenAI, model_name: str, backend: str):
 
 
 @pytest.fixture(scope="module")
-def oning_client(server: RemoteOpenAIServer) -> openai.OpenAI:
+def async_client(server: RemoteOpenAIServer) -> openai.AsyncOpenAI:
     return server.get_async_client()
 
 
@@ -90,10 +86,10 @@ async def process_stream(
 
 
 @pytest.mark.asyncio(loop_scope="module")
-async def test_reasoning_parser_streaming(oning_client: openai.OpenAI,
-                                          model_name: str, backend: str):
+async def test_reasoning_parser_streaming(async_client: openai.AsyncOpenAI,
+                                          model_name: str):
     messages = [{"role": "user", "content": "hi"}]
-    stream = await oning_client.chat.completions.create(
+    stream = await async_client.chat.completions.create(
         model=model_name,
         messages=messages,
         max_completion_tokens=1000,
@@ -106,7 +102,7 @@ async def test_reasoning_parser_streaming(oning_client: openai.OpenAI,
     assert len(content_chunks) > 0
     assert len(reasoning_content_chunks) > 0
 
-    stream = await oning_client.chat.completions.create(
+    stream = await async_client.chat.completions.create(
        model=model_name,
        messages=messages,
        max_completion_tokens=1,
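A minimal sketch of the server-fixture pattern the updated tests converge on, assuming the RemoteOpenAIServer wrapper and get_model_path helper already used by the tests under tests/unittest/llmapi/apps; the import paths and the fixture shown here are illustrative, not part of this patch:

    import pytest

    # Illustrative imports: the real test modules pull these helpers from the
    # llmapi test utilities; the exact module paths are assumptions here.
    from .openai_server import RemoteOpenAIServer
    from ..test_llm import get_model_path


    @pytest.fixture(scope="module", params=["trt", "pytorch"])
    def backend(request):
        # Both backends are now explicit fixture params; None no longer
        # stands in for the TensorRT path.
        return request.param


    @pytest.fixture(scope="module")
    def server(backend):
        model_path = get_model_path("llama-models-v2/TinyLlama-1.1B-Chat-v1.0")
        # --backend is always forwarded to the serve CLI; beam-width settings
        # only apply on the TensorRT ("trt") path.
        args = ["--backend", backend]
        if backend == "trt":
            args.extend(["--max_beam_width", "4"])
        with RemoteOpenAIServer(model_path, args) as remote_server:
            yield remote_server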