diff --git a/tests/integration/defs/common.py b/tests/integration/defs/common.py
index 365e1e6b551..823488c7d72 100644
--- a/tests/integration/defs/common.py
+++ b/tests/integration/defs/common.py
@@ -16,6 +16,7 @@
 import os
 import platform
 import re
+import time
 from difflib import SequenceMatcher
 from pathlib import Path
 
@@ -771,7 +772,9 @@ def test_multi_lora_support(
         zero_lora_weights=True,
         use_code_prompts=False,
 ):
+    start_time = time.time()
     print("Creating dummy LoRAs...")
+    lora_start = time.time()
     lora_paths = generate_dummy_loras(
         hf_model_dir=hf_model_dir,
         lora_output_dir=llm_venv.get_working_directory(),
@@ -779,8 +782,13 @@ def test_multi_lora_support(
         lora_rank=lora_rank,
         target_modules=target_hf_modules,
         zero_weights=zero_lora_weights)
+    lora_end = time.time()
+    print(
+        f"Creating dummy LoRAs completed in {(lora_end - lora_start):.2f} seconds."
+    )
 
     print("Build engines...")
+    build_start = time.time()
     build_cmd = [
         "trtllm-build",
         f"--checkpoint_dir={tllm_ckpt_dir}",
@@ -801,6 +809,9 @@ def test_multi_lora_support(
         "--max_beam_width=1",
     ]
     check_call(" ".join(build_cmd), shell=True, env=llm_venv._new_env)
+    build_end = time.time()
+    print(
+        f"Build engines completed in {(build_end - build_start):.2f} seconds.")
 
     if use_code_prompts:
         input_prompts = [
@@ -822,6 +833,7 @@ def test_multi_lora_support(
     ]
 
     print("Run inference with C++ runtime with pybind...")
+    inference_start = time.time()
     run_script = f"{example_root}/../../../run.py" if "core" in example_root else f"{example_root}/../run.py"
     run_cmd = [
         run_script,
@@ -842,6 +854,15 @@ def test_multi_lora_support(
         "--max_output_len=30",
     ]
     venv_check_call(llm_venv, run_cmd)
+    inference_end = time.time()
+    print(
+        f"Inference completed in {(inference_end - inference_start):.2f} seconds."
+    )
+
+    total_time = time.time() - start_time
+    print(
+        f"Total test_multi_lora_support execution time: {total_time:.2f} seconds"
+    )
 
 
 def get_dummy_spec_decoding_heads(hf_model_dir,
diff --git a/tests/integration/defs/examples/test_gemma.py b/tests/integration/defs/examples/test_gemma.py
index 1759d7631bc..c0a6cbceafc 100644
--- a/tests/integration/defs/examples/test_gemma.py
+++ b/tests/integration/defs/examples/test_gemma.py
@@ -12,6 +12,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+import time
 from pathlib import Path
 
 import pytest
@@ -429,7 +430,9 @@ def test_hf_gemma_fp8_base_bf16_multi_lora(gemma_model_root,
                                            batch_size=8):
     "Run Gemma models with multiple dummy LoRAs."
 
+    start_time = time.time()
     print("Convert checkpoint by modelopt...")
+    convert_start = time.time()
     kv_cache_dtype = 'fp8' if qformat == 'fp8' else 'int8'
     convert_cmd = [
         f"{gemma_example_root}/../../../quantization/quantize.py",
@@ -441,7 +444,13 @@
         f"--output_dir={cmodel_dir}",
     ]
     venv_check_call(llm_venv, convert_cmd)
+    convert_end = time.time()
+    print(
+        f"Convert checkpoint completed in {(convert_end - convert_start):.2f} seconds."
+    )
 
+    test_multi_lora_start = time.time()
+    print("Calling test_multi_lora_support...")
     test_multi_lora_support(
         hf_model_dir=gemma_model_root,
         tllm_ckpt_dir=cmodel_dir,
@@ -454,3 +463,10 @@ def test_hf_gemma_fp8_base_bf16_multi_lora(gemma_model_root,
         target_trtllm_modules=["attn_q", "attn_k", "attn_v"],
         zero_lora_weights=True,
     )
+    test_multi_lora_end = time.time()
+    print(
+        f"test_multi_lora_support completed in {(test_multi_lora_end - test_multi_lora_start):.2f} seconds"
+    )
+
+    total_time = time.time() - start_time
+    print(f"Total function execution time: {total_time:.2f} seconds")
diff --git a/tests/integration/defs/examples/test_granite.py b/tests/integration/defs/examples/test_granite.py
index 63084c99ae3..234a07c174c 100644
--- a/tests/integration/defs/examples/test_granite.py
+++ b/tests/integration/defs/examples/test_granite.py
@@ -14,6 +14,7 @@
 # limitations under the License.
 
 import os
+import time
 
 import pytest
 from defs.common import (convert_weights, test_multi_lora_support,
@@ -96,7 +97,9 @@ def test_granite_bf16_lora(llama_example_root,
     "Run Granite 3.0 models with multiple dummy LoRAs."
 
     # TODO: Enable fp8 quantization when ModelOpt changes for Granite are available.
+    start_time = time.time()
     print("Converting checkpoint...")
+    convert_start = time.time()
     model_name = os.path.basename(llm_granite_model_root)
     dtype = 'bfloat16'
 
@@ -108,6 +111,11 @@
         model_path=llm_granite_model_root,
         data_type=dtype,
     )
+    convert_end = time.time()
+    print(
+        f"Convert checkpoint completed in {(convert_end - convert_start):.2f} seconds."
+    )
+
     target_hf_modules = [
         "q_proj",
         "k_proj",
@@ -122,6 +130,8 @@
         target_hf_modules += ["moe_h_to_4h", "moe_4h_to_h", "moe_gate"]
         target_trtllm_modules += ["moe_h_to_4h", "moe_4h_to_h", "moe_gate"]
 
+    print("Calling test_multi_lora_support...")
+    test_multi_lora_start = time.time()
     test_multi_lora_support(
         hf_model_dir=llm_granite_model_root,
         tllm_ckpt_dir=ckpt_dir,
@@ -134,3 +144,10 @@
         target_trtllm_modules=target_trtllm_modules,
         zero_lora_weights=True,
     )
+    test_multi_lora_end = time.time()
+    print(
+        f"test_multi_lora_support completed in {(test_multi_lora_end - test_multi_lora_start):.2f} seconds"
+    )
+
+    total_time = time.time() - start_time
+    print(f"Total function execution time: {total_time:.2f} seconds")
diff --git a/tests/integration/test_lists/test-db/l0_l40s.yml b/tests/integration/test_lists/test-db/l0_l40s.yml
index 1ea66b87726..ca9a7a09e5b 100644
--- a/tests/integration/test_lists/test-db/l0_l40s.yml
+++ b/tests/integration/test_lists/test-db/l0_l40s.yml
@@ -106,6 +106,6 @@ l0_l40s:
 - examples/test_redrafter.py::test_llm_redrafter_1gpu[use_py_session-redrafter-vicuna-7b-v1.3-bfloat16-dl5-nb5-bs8]
 - examples/test_redrafter.py::test_llm_redrafter_1gpu[use_py_session-redrafter-vicuna-7b-v1.3-bfloat16-dl5-nb8-bs8]
 - examples/test_granite.py::test_granite_bf16_lora[granite-3.0-2b-instruct]
-- examples/test_granite.py::test_granite_bf16_lora[granite-3.0-1b-a400m-instruct]
+- examples/test_granite.py::test_granite_bf16_lora[granite-3.0-1b-a400m-instruct] TIMEOUT (90)
 - examples/test_llama.py::test_llama_3_x_fp8_with_bf16_lora[llama-v3-8b-instruct-hf]
 - examples/test_qwen.py::test_llm_qwen1_5_7b_single_gpu_lora[qwen1.5_7b_chat-Qwen1.5-7B-Chat-750Mb-lora]
diff --git a/tests/integration/test_lists/waives.txt b/tests/integration/test_lists/waives.txt
index 43889db226e..9e053676bdd 100644
--- a/tests/integration/test_lists/waives.txt
+++ b/tests/integration/test_lists/waives.txt
@@ -393,9 +393,7 @@ examples/test_gpt.py::test_starcoder_fp8_quantization_2gpu[starcoderplus] SKIP (
 examples/test_multimodal.py::test_llm_multimodal_general[fuyu-8b-pp:1-tp:1-float16-bs:8-cpp_e2e:True-nb:1] SKIP (https://nvbugs/5360086)
 examples/test_multimodal.py::test_llm_multimodal_general[llava-1.5-7b-hf-pp:1-tp:1-float16-bs:8-cpp_e2e:True-nb:1] SKIP (https://nvbugs/5360086)
 test_e2e.py::test_trtllm_bench_llmapi_launch[trt_backend-llama-v3-llama3-8b] SKIP (https://nvbugs/5320234)
-examples/test_granite.py::test_granite_bf16_lora[granite-3.0-1b-a400m-instruct] SKIP (https://nvbugs/5374145)
 stress_test/stress_test.py::test_run_stress_test[llama-v3-8b-instruct-hf_tp1-stress_time_300s_timeout_450s-GUARANTEED_NO_EVICT-pytorch-stress-test] SKIP (https://nvbugs/5375646)
-examples/test_gemma.py::test_hf_gemma_fp8_base_bf16_multi_lora[gemma-2-9b-it] SKIP (https://nvbugs/5376087)
 full:GH200/disaggregated/test_disaggregated.py::test_disaggregated_deepseek_v3_lite_fp8_attention_dp_one[DeepSeek-V3-Lite-fp8] SKIP (https://nvbugs/5375966)
 accuracy/test_llm_api_pytorch.py::TestGemma3_1BInstruct::test_auto_dtype SKIP (https://nvbugs/5375620)
 test_e2e.py::test_ptp_quickstart_advanced[Mixtral-8x7B-NVFP4-nvfp4-quantized/Mixtral-8x7B-Instruct-v0.1] SKIP (https://nvbugs/5377465)
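
Every timed stage in this change follows the same pattern: capture time.time() before and after the step, then print the elapsed seconds. Below is a minimal sketch of that pattern factored into a reusable context manager; the timed_stage helper is hypothetical and not part of this diff, which deliberately keeps the inline start/end pairs instead.

import time
from contextlib import contextmanager


@contextmanager
def timed_stage(label):
    # Hypothetical helper (not in this diff): wraps a stage and prints its
    # elapsed wall-clock time, mirroring the start/end pairs added above.
    start = time.time()
    try:
        yield
    finally:
        print(f"{label} completed in {(time.time() - start):.2f} seconds.")


# Usage roughly equivalent to the "Build engines..." timing above:
# with timed_stage("Build engines"):
#     check_call(" ".join(build_cmd), shell=True, env=llm_venv._new_env)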