6 changes: 3 additions & 3 deletions tensorrt_llm/commands/serve.py
@@ -71,7 +71,7 @@ def _signal_handler_cleanup_child(signum, frame):

def get_llm_args(model: str,
tokenizer: Optional[str] = None,
backend: Optional[str] = None,
backend: str = "pytorch",
max_beam_width: int = BuildConfig.max_beam_width,
max_batch_size: int = BuildConfig.max_batch_size,
max_num_tokens: int = BuildConfig.max_num_tokens,
@@ -165,8 +165,8 @@ def launch_server(host: str,
help="Hostname of the server.")
@click.option("--port", type=int, default=8000, help="Port of the server.")
@click.option("--backend",
type=click.Choice(["pytorch"]),
default=None,
type=click.Choice(["pytorch", "trt"]),
default="pytorch",
help="Set to 'pytorch' for pytorch path. Default is cpp path.")
@click.option('--log_level',
type=click.Choice(severity_map.keys()),
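With this change the PyTorch backend becomes the default and the TensorRT (cpp) path must be requested explicitly. A minimal usage sketch, not part of the diff, assuming the get_llm_args signature shown above; the model name is only a placeholder:

from tensorrt_llm.commands.serve import get_llm_args

# New default: omitting backend now selects the PyTorch path.
args_default = get_llm_args("TinyLlama/TinyLlama-1.1B-Chat-v1.0")

# The TensorRT (cpp) path must now be selected explicitly instead of via backend=None.
args_trt = get_llm_args("TinyLlama/TinyLlama-1.1B-Chat-v1.0", backend="trt")

The same applies at the CLI level: `trtllm-serve <model>` now launches the PyTorch backend, and `--backend trt` selects the TensorRT engine path.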
@@ -2,6 +2,7 @@ hostname: localhost
port: 8000
model: TinyLlama/TinyLlama-1.1B-Chat-v1.0
free_gpu_memory_fraction: 0.25
backend: "trt"
context_servers:
num_instances: 1
tensor_parallel_size: 2
@@ -1,6 +1,7 @@
hostname: localhost
port: 8000
model: TinyLlama/TinyLlama-1.1B-Chat-v1.0
backend: "trt"
context_servers:
num_instances: 0
generation_servers:
@@ -2,6 +2,7 @@ hostname: localhost
port: 8000
model: TinyLlama/TinyLlama-1.1B-Chat-v1.0
free_gpu_memory_fraction: 0.25
backend: "trt"
context_servers:
num_instances: 1
tensor_parallel_size: 1
11 changes: 3 additions & 8 deletions tests/integration/defs/stress_test/stress_test.py
@@ -364,12 +364,11 @@ def test_run_stress_test(config, stress_time_timeout, backend,
"""
# Create a new ModelConfig with the backend parameter
# Convert 'trt' to None as expected by the ModelConfig
backend_param = None if backend == "trt" else backend

new_config = ModelConfig(model_dir=config.model_dir,
tp_size=config.tp_size,
memory_requirement=config.memory_requirement,
backend=backend_param)
backend=backend)

# Extract stress_time and stress_timeout from the tuple
stress_time, stress_timeout = stress_time_timeout
@@ -542,6 +541,8 @@ def stress_test(config,
str(config.tp_size),
"--pp_size",
str(test_server_config.pp_size),
"--backend",
config.backend,
]

# Only add ep_size parameter if it's not None
@@ -560,12 +561,6 @@
extra_llm_options_path,
])

# Add backend option only if specified
# backend = None means trt backend
# backend = pytorch means pytorch backend
if config.backend:
server_cmd.extend(["--backend", config.backend])

# Log the command we're about to run
print_info(f"Running command: {' '.join(server_cmd)}")

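Because config.backend is now always a concrete value ("pytorch" or "trt") rather than None, the stress test appends --backend unconditionally. A condensed sketch of the resulting command assembly under that assumption, with hypothetical paths and sizes:

# Sketch only: mirrors the command construction above with hypothetical values.
backend = "trt"  # or "pytorch"; never None after this change
server_cmd = [
    "trtllm-serve",
    "/path/to/model",      # hypothetical model directory
    "--tp_size", "2",
    "--pp_size", "1",
    "--backend", backend,  # always appended now; the old conditional block is gone
]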
43 changes: 0 additions & 43 deletions tests/integration/defs/test_e2e.py
@@ -1407,13 +1407,7 @@ def test_openai_completions_example(llm_root, llm_venv, backend: str):

@pytest.mark.parametrize("backend", ["pytorch", "trt"])
def test_openai_chat_example(llm_root, llm_venv, backend: str):
example_root = Path(os.path.join(llm_root, "examples", "apps"))
test_root = unittest_path() / "llmapi" / "apps"
llm_venv.run_cmd([
"-m", "pip", "install", "-r",
os.path.join(example_root, "requirements.txt")
])

llm_venv.run_cmd([
"-m", "pytest",
str(test_root / "_test_openai_chat.py"), "-k", backend
@@ -1435,21 +1429,14 @@ def test_openai_lora(llm_root, llm_venv):


def test_openai_chat_multimodal_example(llm_root, llm_venv):
example_root = Path(os.path.join(llm_root, "examples", "apps"))
test_root = unittest_path() / "llmapi" / "apps"
llm_venv.run_cmd([
"-m", "pip", "install", "-r",
os.path.join(example_root, "requirements.txt")
])

llm_venv.run_cmd(
["-m", "pytest",
str(test_root / "_test_openai_chat_multimodal.py")])


def test_openai_chat_structural_tag_example(llm_venv):
test_root = unittest_path() / "llmapi" / "apps"

llm_venv.run_cmd([
"-m", "pytest",
str(test_root / "_test_openai_chat_structural_tag.py")
@@ -1459,13 +1446,7 @@ def test_openai_chat_structural_tag_example(llm_venv):
@pytest.mark.skip_less_device(2)
@pytest.mark.skip_less_device_memory(40000)
def test_openai_multi_chat_example(llm_root, llm_venv):
example_root = Path(os.path.join(llm_root, "examples", "apps"))
test_root = unittest_path() / "llmapi" / "apps"
llm_venv.run_cmd([
"-m", "pip", "install", "-r",
os.path.join(example_root, "requirements.txt")
])

llm_venv.run_cmd(
["-m", "pytest",
str(test_root / "_test_openai_multi_chat.py")])
@@ -1475,13 +1456,7 @@ def test_openai_multi_chat_example(llm_root, llm_venv):
@pytest.mark.skip_less_device(4)
@pytest.mark.skip_less_device_memory(80000)
def test_openai_consistent_chat(llm_root, llm_venv):
example_root = Path(os.path.join(llm_root, "examples", "apps"))
test_root = unittest_path() / "llmapi" / "apps"
llm_venv.run_cmd([
"-m", "pip", "install", "-r",
os.path.join(example_root, "requirements.txt")
])

llm_venv.run_cmd(
["-m", "pytest",
str(test_root / "_test_openai_consistent_chat.py")])
@@ -1491,13 +1466,7 @@ def test_openai_consistent_chat(llm_root, llm_venv):
@pytest.mark.skip_less_device(4)
@pytest.mark.skip_less_device_memory(80000)
def test_openai_multinodes_chat_tp16pp1(llm_root, llm_venv):
example_root = Path(os.path.join(llm_root, "examples", "apps"))
test_root = unittest_path() / "llmapi" / "apps"
llm_venv.run_cmd([
"-m", "pip", "install", "-r",
os.path.join(example_root, "requirements.txt")
])

llm_venv.run_cmd([
"-m", "pytest", "-k", "tp16pp1",
str(test_root / "_test_openai_multi_nodes.py")
@@ -1508,13 +1477,7 @@ def test_openai_multinodes_chat_tp16pp1(llm_root, llm_venv):
@pytest.mark.skip_less_device(4)
@pytest.mark.skip_less_device_memory(80000)
def test_openai_multinodes_chat_tp8pp2(llm_root, llm_venv):
example_root = Path(os.path.join(llm_root, "examples", "apps"))
test_root = unittest_path() / "llmapi" / "apps"
llm_venv.run_cmd([
"-m", "pip", "install", "-r",
os.path.join(example_root, "requirements.txt")
])

llm_venv.run_cmd([
"-m", "pytest", "-k", "tp8pp2",
str(test_root / "_test_openai_multi_nodes.py")
@@ -1523,13 +1486,7 @@ def test_openai_multinodes_chat_tp8pp2(llm_root, llm_venv):

@pytest.mark.skip_less_device_memory(80000)
def test_trtllm_benchmark_serving(llm_root, llm_venv):
example_root = Path(os.path.join(llm_root, "examples", "apps"))
test_root = unittest_path() / "llmapi" / "apps"
llm_venv.run_cmd([
"-m", "pip", "install", "-r",
os.path.join(example_root, "requirements.txt")
])

llm_venv.run_cmd(
["-m", "pytest",
str(test_root / "_test_trtllm_serve_benchmark.py")])
11 changes: 4 additions & 7 deletions tests/unittest/llmapi/apps/_test_openai_chat.py
@@ -20,9 +20,7 @@ def model_name():
return "llama-models-v2/TinyLlama-1.1B-Chat-v1.0"


@pytest.fixture(scope="module",
params=[None, 'pytorch'],
ids=["trt", "pytorch"])
@pytest.fixture(scope="module", params=["trt", "pytorch"])
def backend(request):
return request.param

@@ -67,10 +65,9 @@ def temp_extra_llm_api_options_file(request):
def server(model_name: str, backend: str, extra_llm_api_options: bool,
temp_extra_llm_api_options_file: str, num_postprocess_workers: int):
model_path = get_model_path(model_name)
if backend == "pytorch":
args = ["--backend", f"{backend}"]
else:
args = ["--max_beam_width", "4"]
args = ["--backend", f"{backend}"]
if backend == "trt":
args.extend(["--max_beam_width", "4"])
if extra_llm_api_options:
args.extend(
["--extra_llm_api_options", temp_extra_llm_api_options_file])
9 changes: 4 additions & 5 deletions tests/unittest/llmapi/apps/_test_openai_completions.py
@@ -14,7 +14,7 @@ def model_name():
return "llama-models-v2/TinyLlama-1.1B-Chat-v1.0"


@pytest.fixture(scope="module", params=["trt", 'pytorch'])
@pytest.fixture(scope="module", params=["trt", "pytorch"])
def backend(request):
return request.param

@@ -29,10 +29,9 @@ def num_postprocess_workers(request):
@pytest.fixture(scope="module")
def server(model_name: str, backend: str, num_postprocess_workers: int):
model_path = get_model_path(model_name)
if backend == "pytorch":
args = ["--backend", f"{backend}"]
else:
args = ["--max_beam_width", "4"]
args = ["--backend", f"{backend}"]
if backend == "trt":
args.extend(["--max_beam_width", "4"])
args.extend(["--num_postprocess_workers", f"{num_postprocess_workers}"])
with RemoteOpenAIServer(model_path, args) as remote_server:
yield remote_server
1 change: 0 additions & 1 deletion tests/unittest/llmapi/apps/_test_openai_metrics.py
@@ -21,7 +21,6 @@ def client():
llm = PyTorchLLM(model=llama_model_path,
build_config=build_config,
kv_cache_config=KvCacheConfig(),
backend="pytorch",
enable_iter_perf_stats=True)
hf_tokenizer = AutoTokenizer.from_pretrained(llama_model_path)

20 changes: 7 additions & 13 deletions tests/unittest/llmapi/apps/_test_openai_misc.py
@@ -15,17 +15,17 @@ def model_name():
return "llama-models-v2/TinyLlama-1.1B-Chat-v1.0"


@pytest.fixture(scope="module", params=["trt", 'pytorch'])
@pytest.fixture(scope="module", params=["trt", "pytorch"])
def backend(request):
return request.param


@pytest.fixture(scope="module", params=['8'])
@pytest.fixture(scope="module", params=["8"])
def max_batch_size(request):
return request.param


@pytest.fixture(scope="module", params=['80000'])
@pytest.fixture(scope="module", params=["80000"])
def max_seq_len(request):
return request.param

@@ -34,19 +34,13 @@
def server(model_name: str, backend: str, max_batch_size: str,
max_seq_len: str):
model_path = get_model_path(model_name)
args = []
if backend == "pytorch":
args.append("--backend")
args.append(backend)
args = ["--backend", f"{backend}"]
if backend != "pytorch":
args.append("--max_beam_width")
args.append("4")
args.extend(["--max_beam_width", "4"])
if max_batch_size is not None:
args.append("--max_batch_size")
args.append(max_batch_size)
args.extend(["--max_batch_size", max_batch_size])
if max_seq_len is not None:
args.append("--max_seq_len")
args.append(max_seq_len)
args.extend(["--max_seq_len", max_seq_len])
with RemoteOpenAIServer(model_path, args) as remote_server:
yield remote_server

15 changes: 5 additions & 10 deletions tests/unittest/llmapi/apps/_test_openai_multi_gpu.py
@@ -15,9 +15,7 @@ def model_name():
return "llama-models-v3/llama-v3-8b-instruct-hf"


@pytest.fixture(scope="module",
params=[None, 'pytorch'],
ids=["trt", "pytorch"])
@pytest.fixture(scope="module", params=["trt", "pytorch"])
def backend(request):
return request.param

@@ -55,13 +53,10 @@ def temp_extra_llm_api_options_file(request):
def server(model_name: str, backend: str, extra_llm_api_options: bool,
temp_extra_llm_api_options_file: str):
model_path = get_model_path(model_name)
args = ["--tp_size", "2", "--max_beam_width", "1"]
if backend is not None:
args.append("--backend")
args.append(backend)
args = ["--tp_size", "2", "--max_beam_width", "1", "--backend", backend]
if extra_llm_api_options:
args.append("--extra_llm_api_options")
args.append(temp_extra_llm_api_options_file)
args.extend(
["--extra_llm_api_options", temp_extra_llm_api_options_file])
with RemoteOpenAIServer(model_path, args) as remote_server:
yield remote_server

@@ -95,7 +90,7 @@ def test_chat_tp2(client: openai.OpenAI, model_name: str):
assert len(chat_completion.choices) == 1
assert chat_completion.usage.completion_tokens == 1
message = chat_completion.choices[0].message
assert message.content == 'Two'
assert message.content == "Two"


@skip_single_gpu
15 changes: 10 additions & 5 deletions tests/unittest/llmapi/apps/_test_openai_multi_nodes.py
@@ -48,12 +48,17 @@ def server(model_name: str, backend: str, tp_pp_size: tuple):
tp_size, pp_size = tp_pp_size
device_count = torch.cuda.device_count()
args = [
"--tp_size", f"{tp_size}", "--pp_size", f"{pp_size}", "--gpus_per_node",
f"{device_count}", "--kv_cache_free_gpu_memory_fraction", "0.95"
"--tp_size",
f"{tp_size}",
"--pp_size",
f"{pp_size}",
"--gpus_per_node",
f"{device_count}",
"--kv_cache_free_gpu_memory_fraction",
"0.95",
"--backend",
backend,
]
if backend is not None:
args.append("--backend")
args.append(backend)
with RemoteOpenAIServer(model_path, args, llmapi_launch=True,
port=8001) as remote_server:
yield remote_server