diff --git a/jenkins/L0_Test.groovy b/jenkins/L0_Test.groovy
index 98c15dc1c33..b2a469fdbfd 100644
--- a/jenkins/L0_Test.groovy
+++ b/jenkins/L0_Test.groovy
@@ -1527,6 +1527,12 @@ def launchTestJobs(pipeline, testFilter, dockerNode=null)
         "DGX_H100-4_GPUs-CPP-1": ["dgx-h100-x4", "l0_dgx_h100", 1, 1, 4],
         "A10-PyTorch-1": ["a10", "l0_a10", 1, 1],
         "A10-CPP-1": ["a10", "l0_a10", 1, 1],
+        "A10-TensorRT-1": ["a10", "l0_a10", 1, 6],
+        "A10-TensorRT-2": ["a10", "l0_a10", 2, 6],
+        "A10-TensorRT-3": ["a10", "l0_a10", 3, 6],
+        "A10-TensorRT-4": ["a10", "l0_a10", 4, 6],
+        "A10-TensorRT-5": ["a10", "l0_a10", 5, 6],
+        "A10-TensorRT-6": ["a10", "l0_a10", 6, 6],
         "A30-Triton-1": ["a30", "l0_a30", 1, 1],
         "A30-PyTorch-1": ["a30", "l0_a30", 1, 2],
         "A30-PyTorch-2": ["a30", "l0_a30", 2, 2],
@@ -1538,19 +1544,19 @@ def launchTestJobs(pipeline, testFilter, dockerNode=null)
         "H100_PCIe-PyTorch-2": ["h100-cr", "l0_h100", 2, 3],
         "H100_PCIe-PyTorch-3": ["h100-cr", "l0_h100", 3, 3],
         "H100_PCIe-CPP-1": ["h100-cr", "l0_h100", 1, 1],
+        "H100_PCIe-TensorRT-1": ["h100-cr", "l0_h100", 1, 2],
+        "H100_PCIe-TensorRT-2": ["h100-cr", "l0_h100", 2, 2],
         "B200_PCIe-PyTorch-1": ["b100-ts2", "l0_b200", 1, 2],
         "B200_PCIe-PyTorch-2": ["b100-ts2", "l0_b200", 2, 2],
+        "B200_PCIe-TensorRT-1": ["b100-ts2", "l0_b200", 1, 2],
+        "B200_PCIe-TensorRT-2": ["b100-ts2", "l0_b200", 2, 2],
         "RTX5090-PyTorch-1": ["rtx-5090", "l0_gb202", 1, 1],
+        "RTX5080-TensorRT-1": ["rtx-5080", "l0_gb203", 1, 2],
+        "RTX5080-TensorRT-2": ["rtx-5080", "l0_gb203", 2, 2],
         // Currently post-merge test stages only run tests with "stage: post_merge" mako
         // in the test-db. This behavior may change in the future.
-        "A10-TensorRT-[Post-Merge]-1": ["a10", "l0_a10", 1, 8],
-        "A10-TensorRT-[Post-Merge]-2": ["a10", "l0_a10", 2, 8],
-        "A10-TensorRT-[Post-Merge]-3": ["a10", "l0_a10", 3, 8],
-        "A10-TensorRT-[Post-Merge]-4": ["a10", "l0_a10", 4, 8],
-        "A10-TensorRT-[Post-Merge]-5": ["a10", "l0_a10", 5, 8],
-        "A10-TensorRT-[Post-Merge]-6": ["a10", "l0_a10", 6, 8],
-        "A10-TensorRT-[Post-Merge]-7": ["a10", "l0_a10", 7, 8],
-        "A10-TensorRT-[Post-Merge]-8": ["a10", "l0_a10", 8, 8],
+        "A10-TensorRT-[Post-Merge]-1": ["a10", "l0_a10", 1, 2],
+        "A10-TensorRT-[Post-Merge]-2": ["a10", "l0_a10", 2, 2],
         "A30-TensorRT-[Post-Merge]-1": ["a30", "l0_a30", 1, 6],
         "A30-TensorRT-[Post-Merge]-2": ["a30", "l0_a30", 2, 6],
         "A30-TensorRT-[Post-Merge]-3": ["a30", "l0_a30", 3, 6],
@@ -1575,18 +1581,12 @@ def launchTestJobs(pipeline, testFilter, dockerNode=null)
         "L40S-TensorRT-[Post-Merge]-5": ["l40s", "l0_l40s", 5, 5],
         "H100_PCIe-PyTorch-[Post-Merge]-1": ["h100-cr", "l0_h100", 1, 1],
         "H100_PCIe-CPP-[Post-Merge]-1": ["h100-cr", "l0_h100", 1, 1],
-        "H100_PCIe-TensorRT-[Post-Merge]-1": ["h100-cr", "l0_h100", 1, 7],
-        "H100_PCIe-TensorRT-[Post-Merge]-2": ["h100-cr", "l0_h100", 2, 7],
-        "H100_PCIe-TensorRT-[Post-Merge]-3": ["h100-cr", "l0_h100", 3, 7],
-        "H100_PCIe-TensorRT-[Post-Merge]-4": ["h100-cr", "l0_h100", 4, 7],
-        "H100_PCIe-TensorRT-[Post-Merge]-5": ["h100-cr", "l0_h100", 5, 7],
-        "H100_PCIe-TensorRT-[Post-Merge]-6": ["h100-cr", "l0_h100", 6, 7],
-        "H100_PCIe-TensorRT-[Post-Merge]-7": ["h100-cr", "l0_h100", 7, 7],
+        "H100_PCIe-TensorRT-[Post-Merge]-1": ["h100-cr", "l0_h100", 1, 5],
+        "H100_PCIe-TensorRT-[Post-Merge]-2": ["h100-cr", "l0_h100", 2, 5],
+        "H100_PCIe-TensorRT-[Post-Merge]-3": ["h100-cr", "l0_h100", 3, 5],
+        "H100_PCIe-TensorRT-[Post-Merge]-4": ["h100-cr", "l0_h100", 4, 5],
+        "H100_PCIe-TensorRT-[Post-Merge]-5": ["h100-cr", "l0_h100", 5, 5],
         "B200_PCIe-Triton-Python-[Post-Merge]-1": ["b100-ts2", "l0_b200", 1, 1],
-        "B200_PCIe-[Post-Merge]-TensorRT-1": ["b100-ts2", "l0_b200", 1, 2],
-        "B200_PCIe-[Post-Merge]-TensorRT-2": ["b100-ts2", "l0_b200", 2, 2],
-        "RTX5080-[Post-Merge]-TensorRT-1": ["rtx-5080", "l0_gb203", 1, 2],
-        "RTX5080-[Post-Merge]-TensorRT-2": ["rtx-5080", "l0_gb203", 2, 2],
         "H100_PCIe-TensorRT-Perf-1": ["h100-cr", "l0_perf", 1, 1],
         "H100_PCIe-PyTorch-Perf-1": ["h100-cr", "l0_perf", 1, 1],
         "DGX_H200-8_GPUs-PyTorch-[Post-Merge]-1": ["dgx-h200-x8", "l0_dgx_h200", 1, 1, 8],
diff --git a/tests/integration/test_lists/test-db/l0_a10.yml b/tests/integration/test_lists/test-db/l0_a10.yml
index 0df8945aa42..1260a3e259f 100644
--- a/tests/integration/test_lists/test-db/l0_a10.yml
+++ b/tests/integration/test_lists/test-db/l0_a10.yml
@@ -51,7 +51,7 @@ l0_a10:
       - '*a10*'
       linux_distribution_name: ubuntu*
     terms:
-      stage: post_merge
+      stage: pre_merge
       backend: tensorrt
   tests:
   # ------------- TRT tests ---------------
diff --git a/tests/integration/test_lists/test-db/l0_b200.yml b/tests/integration/test_lists/test-db/l0_b200.yml
index 2b3ce352c2b..2c963bc1e66 100644
--- a/tests/integration/test_lists/test-db/l0_b200.yml
+++ b/tests/integration/test_lists/test-db/l0_b200.yml
@@ -64,7 +64,7 @@ l0_b200:
       - '*b100*'
       linux_distribution_name: ubuntu*
     terms:
-      stage: post_merge
+      stage: pre_merge
       backend: tensorrt
   tests:
   # ------------- TRT tests ---------------
diff --git a/tests/integration/test_lists/test-db/l0_gb203.yml b/tests/integration/test_lists/test-db/l0_gb203.yml
index 5f1754031a2..dda30e58fd5 100644
--- a/tests/integration/test_lists/test-db/l0_gb203.yml
+++ b/tests/integration/test_lists/test-db/l0_gb203.yml
@@ -10,7 +10,7 @@ l0_gb203:
       - '*gb203*'
       linux_distribution_name: ubuntu*
     terms:
-      stage: post_merge
+      stage: pre_merge
       backend: tensorrt
   tests:
   # ------------- TRT tests ---------------
diff --git a/tests/integration/test_lists/test-db/l0_h100.yml b/tests/integration/test_lists/test-db/l0_h100.yml
index 9d49bbb2c7a..901d9164cda 100644
--- a/tests/integration/test_lists/test-db/l0_h100.yml
+++ b/tests/integration/test_lists/test-db/l0_h100.yml
@@ -120,7 +120,7 @@ l0_h100:
       - '*h100*'
       linux_distribution_name: ubuntu*
     terms:
-      stage: post_merge
+      stage: pre_merge
       backend: tensorrt
   tests:
   # ------------- TRT tests ---------------
@@ -129,52 +129,30 @@ l0_h100:
   - unittest/trt/quantization/test_weight_only_quant_matmul.py
   - unittest/trt/quantization/test_weight_only_groupwise_quant_matmul.py
   - test_e2e.py::test_trtllm_bench_sanity[-extra_config-streaming-FP16-meta-llama/Llama-3.1-8B-llama-3.1-model/Meta-Llama-3.1-8B]
-  - test_e2e.py::test_trtllm_bench_sanity[-extra_config-non-streaming-FP16-meta-llama/Llama-3.1-8B-llama-3.1-model/Meta-Llama-3.1-8B]
   - test_e2e.py::test_trtllm_bench_sanity[--streaming-FP16-meta-llama/Llama-3.1-8B-llama-3.1-model/Meta-Llama-3.1-8B]
-  - test_e2e.py::test_trtllm_bench_sanity[--non-streaming-FP16-meta-llama/Llama-3.1-8B-llama-3.1-model/Meta-Llama-3.1-8B]
   - test_e2e.py::test_trtllm_bench_latency_sanity[FP16-meta-llama/Llama-3.1-8B-llama-3.1-model/Meta-Llama-3.1-8B]
   - test_e2e.py::test_trtllm_bench_request_rate_and_concurrency[enable_concurrency-]
   - test_e2e.py::test_trtllm_bench_request_rate_and_concurrency[enable_concurrency-enable_request_rate] # negative test
   - test_e2e.py::test_trtllm_bench_iteration_log[TRT-streaming-meta-llama/Llama-3.1-8B-llama-3.1-model/Meta-Llama-3.1-8B]
-  - test_e2e.py::test_trtllm_bench_iteration_log[TRT-non-streaming-meta-llama/Llama-3.1-8B-llama-3.1-model/Meta-Llama-3.1-8B]
   - test_e2e.py::test_trtllm_bench_help_sanity[meta-llama/Llama-3.1-8B]
-  - accuracy/test_cli_flow.py::TestLongAlpaca7B::test_multiblock_aggressive # 6 mins
-  - accuracy/test_cli_flow.py::TestVicuna7B::test_medusa[cuda_graph=False] # 5 mins
-  - accuracy/test_cli_flow.py::TestVicuna7B::test_medusa[cuda_graph=True] # 5 mins
   - examples/test_llama.py::test_llama_3_x_fp8_with_bf16_lora[llama-3.1-8b]
   - examples/test_llama.py::test_llama_3_x_fp8_with_bf16_lora[llama-3.2-1b]
   - examples/test_qwen.py::test_llm_hf_qwen_multi_lora_1gpu[qwen2.5_1.5b_instruct]
   - examples/test_gemma.py::test_hf_gemma_fp8_base_bf16_multi_lora[gemma-2-9b-it]
   - examples/test_gemma.py::test_llm_gemma_1gpu_summary_vswa[gemma-3-1b-it-other-bfloat16-8]
-  - examples/test_phi.py::test_llm_phi_quantization_1gpu[Phi-4-mini-instruct-fp8-bfloat16]
-  - unittest/trt/model_api/test_model_level_api.py # 9 mins on H100
-  - unittest/trt/model_api/test_model_api_multi_gpu.py # 0.5 mins on H100
-  - unittest/trt/model/test_gpt_e2e.py # 3 mins / 6 mins on H100
   - unittest/trt/model/eagle # 1 mins on H100
   - unittest/test_model_runner_cpp.py
   - test_cache.py::test_cache_sanity # 1 sec
   - unittest/llmapi/test_llm_quant.py # 5.5 mins on H100
   - test_e2e.py::test_mistral_large_hidden_vocab_size
   - llmapi/test_llm_examples.py::test_llmapi_quickstart_atexit
-  - examples/test_medusa.py::test_llm_medusa_with_qaunt_base_model_1gpu[fp8-use_py_session-medusa-vicuna-7b-v1.3-4-heads-float16-bs1]
-  - examples/test_medusa.py::test_llm_medusa_with_qaunt_base_model_1gpu[fp8-use_cpp_session-medusa-vicuna-7b-v1.3-4-heads-float16-bs1]
   - unittest/trt/attention/test_gpt_attention_IFB.py
-  - unittest/trt/attention/test_gpt_attention_no_cache.py
-  - unittest/trt/model/test_mamba.py # 3 mins
-  - examples/test_redrafter.py::test_llm_redrafter_1gpu[use_cpp_session-redrafter-vicuna-7b-v1.3-bfloat16-dl5-nb8-bs8]
-  - examples/test_medusa.py::test_llm_medusa_1gpu[use_cpp_session-medusa-vicuna-7b-v1.3-4-heads-bfloat16-bs8]
   - accuracy/test_cli_flow.py::TestLlama3_1_8BInstruct::test_fp8_prequantized
   - accuracy/test_cli_flow.py::TestLlama2_7B::test_fp8
   - accuracy/test_cli_flow.py::TestLlama2_7B::test_fp8_gemm_plugin
   - accuracy/test_cli_flow.py::TestLlama2_7B::test_fp8_gemm_swiglu_plugin
-  - accuracy/test_cli_flow.py::TestLlama2_7B::test_fp8_low_latency_gemm_plugin
   - examples/test_multimodal.py::test_llm_multimodal_general[Llama-3.2-11B-Vision-pp:1-tp:1-bfloat16-bs:1-cpp_e2e:False-nb:1]
-  - examples/test_multimodal.py::test_llm_multimodal_general[Llama-3.2-11B-Vision-pp:1-tp:1-bfloat16-bs:8-cpp_e2e:False-nb:1]
-  - examples/test_multimodal.py::test_llm_fp8_multimodal_general[fp8-fp8-cnn_dailymail-Qwen2-VL-7B-Instruct-pp:1-tp:1-bfloat16-bs:1-cpp_e2e:False]
-  - examples/test_multimodal.py::test_llm_fp8_multimodal_general[fp8-fp8-scienceqa-Llama-3.2-11B-Vision-Instruct-pp:1-tp:1-bfloat16-bs:1-cpp_e2e:False]
   - examples/test_enc_dec.py::test_llm_enc_dec_mmlu[flan-t5-small-float32-tp:1-pp:1-nb:1-disable_fp8] # 4 mins
-  - examples/test_enc_dec.py::test_llm_enc_dec_general[compare_hf-t5-small-float32-enable_gemm_plugin-enable_attention_plugin-enable_paged_kv_cache-tp:1-pp:1-nb:1-enable_fp8] # 3 mins
-  - examples/test_enc_dec.py::test_llm_enc_dec_general[compare_hf-flan-t5-small-float32-enable_gemm_plugin-enable_attention_plugin-enable_paged_kv_cache-tp:1-pp:1-nb:1-enable_fp8] # 3 mins
 - condition:
     ranges:
       system_gpu_count:
@@ -300,3 +278,25 @@ l0_h100:
   - test_e2e.py::test_build_time_benchmark_sanity
   - accuracy/test_llm_api.py::TestEagleVicuna_7B_v1_3::test_auto_dtype
   - accuracy/test_llm_api.py::TestEagle2Vicuna_7B_v1_3::test_auto_dtype
+  - unittest/trt/model/test_mamba.py # 3 mins
+  - examples/test_redrafter.py::test_llm_redrafter_1gpu[use_cpp_session-redrafter-vicuna-7b-v1.3-bfloat16-dl5-nb8-bs8]
+  - examples/test_medusa.py::test_llm_medusa_1gpu[use_cpp_session-medusa-vicuna-7b-v1.3-4-heads-bfloat16-bs8]
+  - unittest/trt/model_api/test_model_level_api.py # 9 mins on H100
+  - unittest/trt/model_api/test_model_api_multi_gpu.py # 0.5 mins on H100
+  - accuracy/test_cli_flow.py::TestLlama2_7B::test_fp8_low_latency_gemm_plugin
+  - examples/test_multimodal.py::test_llm_multimodal_general[Llama-3.2-11B-Vision-pp:1-tp:1-bfloat16-bs:8-cpp_e2e:False-nb:1]
+  - examples/test_multimodal.py::test_llm_fp8_multimodal_general[fp8-fp8-cnn_dailymail-Qwen2-VL-7B-Instruct-pp:1-tp:1-bfloat16-bs:1-cpp_e2e:False]
+  - examples/test_multimodal.py::test_llm_fp8_multimodal_general[fp8-fp8-scienceqa-Llama-3.2-11B-Vision-Instruct-pp:1-tp:1-bfloat16-bs:1-cpp_e2e:False]
+  - examples/test_enc_dec.py::test_llm_enc_dec_general[compare_hf-t5-small-float32-enable_gemm_plugin-enable_attention_plugin-enable_paged_kv_cache-tp:1-pp:1-nb:1-enable_fp8] # 3 mins
+  - examples/test_enc_dec.py::test_llm_enc_dec_general[compare_hf-flan-t5-small-float32-enable_gemm_plugin-enable_attention_plugin-enable_paged_kv_cache-tp:1-pp:1-nb:1-enable_fp8] # 3 mins
+  - test_e2e.py::test_trtllm_bench_sanity[-extra_config-non-streaming-FP16-meta-llama/Llama-3.1-8B-llama-3.1-model/Meta-Llama-3.1-8B]
+  - test_e2e.py::test_trtllm_bench_sanity[--non-streaming-FP16-meta-llama/Llama-3.1-8B-llama-3.1-model/Meta-Llama-3.1-8B]
+  - test_e2e.py::test_trtllm_bench_iteration_log[TRT-non-streaming-meta-llama/Llama-3.1-8B-llama-3.1-model/Meta-Llama-3.1-8B]
+  - accuracy/test_cli_flow.py::TestLongAlpaca7B::test_multiblock_aggressive # 6 mins
+  - accuracy/test_cli_flow.py::TestVicuna7B::test_medusa[cuda_graph=False] # 5 mins
+  - accuracy/test_cli_flow.py::TestVicuna7B::test_medusa[cuda_graph=True] # 5 mins
+  - examples/test_medusa.py::test_llm_medusa_with_qaunt_base_model_1gpu[fp8-use_py_session-medusa-vicuna-7b-v1.3-4-heads-float16-bs1]
+  - examples/test_medusa.py::test_llm_medusa_with_qaunt_base_model_1gpu[fp8-use_cpp_session-medusa-vicuna-7b-v1.3-4-heads-float16-bs1]
+  - examples/test_phi.py::test_llm_phi_quantization_1gpu[Phi-4-mini-instruct-fp8-bfloat16]
+  - unittest/trt/model/test_gpt_e2e.py # 3 mins / 6 mins on H100
+  - unittest/trt/attention/test_gpt_attention_no_cache.py
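
Reading aid for the stage maps in L0_Test.groovy above: each entry's value is a positional list which, judging from the surrounding entries, is [node label, test-db config name, shard index, shard count], with an optional fifth element for GPU count on multi-GPU stages (e.g. "DGX_H100-4_GPUs-CPP-1" ends in 4). The following is an illustrative Groovy sketch of unpacking such an entry under that assumed layout; it is not code from the pipeline itself.

    // Illustrative sketch only (assumed field layout, not code from L0_Test.groovy):
    // each stage name maps to [node label, test-db name, shard index, shard count(, gpu count)].
    def stageConfigs = [
        "A10-TensorRT-1"        : ["a10", "l0_a10", 1, 6],
        "DGX_H100-4_GPUs-CPP-1" : ["dgx-h100-x4", "l0_dgx_h100", 1, 1, 4],
    ]

    stageConfigs.each { stageName, cfg ->
        def (node, testDb, shard, shardCount) = cfg    // first four positions are always present
        def gpuCount = cfg.size() > 4 ? cfg[4] : 1     // optional fifth position for multi-GPU stages
        println "${stageName}: shard ${shard}/${shardCount} of ${testDb} on ${node} (${gpuCount} GPU)"
    }

Under this reading, the diff splits the new pre-merge A10 TensorRT tests across six shards of l0_a10, while the slimmed-down post-merge list now needs only two.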