61 changes: 0 additions & 61 deletions tests/integration/defs/triton_server/build_engines.py
@@ -575,67 +575,6 @@ def prepare_llava_onevision_engine(tensorrt_llm_multimodal_example_root,
return engine_dir, multimodal_engine_dir


def prepare_vila_engine(tensorrt_llm_multimodal_example_root,
tensorrt_llm_llama_example_root, vila_model_root,
vila_repo_root):
# Convert LLAMA from HF
ckpt_dir = os.path.join(tensorrt_llm_multimodal_example_root, "model_dir",
"vila")
convert_cmd = [
"python3",
f"{tensorrt_llm_llama_example_root}/convert_checkpoint.py",
f"--model_dir={vila_model_root}",
f"--output_dir={ckpt_dir}",
"--dtype=float16",
]

# Build VILA
engine_dir = os.path.join(tensorrt_llm_multimodal_example_root,
"engine_dir", "vila")

build_cmd = [
"trtllm-build",
f"--checkpoint_dir={ckpt_dir}",
"--gemm_plugin=float16",
"--max_batch_size=8",
"--max_multimodal_len=6272",
"--max_input_len=2048",
"--max_seq_len=2560",
f"--output_dir={engine_dir}",
]

multimodal_engine_dir = os.path.join(tensorrt_llm_multimodal_example_root,
"tmp", "trt_engines", "VILA1.5-3b",
"multimodal_encoder")

build_visual_engine_cmd = [
"python3", "build_multimodal_engine.py", "--model_type=vila",
f"--model_path={vila_model_root}", "--max_batch_size=32",
f"--vila_path={vila_repo_root}", f"--output_dir={multimodal_engine_dir}"
]

append_timing_cache_args(build_cmd)
convert_cmd = " ".join(convert_cmd)
build_cmd = " ".join(build_cmd)
build_visual_engine_cmd = " ".join(build_visual_engine_cmd)
if not os.path.exists(engine_dir):
check_call(convert_cmd, shell=True)
check_call(build_cmd, shell=True)
check_call(build_visual_engine_cmd,
shell=True,
cwd=tensorrt_llm_multimodal_example_root)
else:
print_info(f"Reusing engine: {engine_dir}")
print_info(f"Skipped: {convert_cmd}")
print_info(f"Skipped: {build_cmd}")
print_info(f"Skipped: {build_visual_engine_cmd}")

assert os.path.exists(engine_dir), f"{engine_dir} does not exists."
assert os.path.exists(
multimodal_engine_dir), f"{multimodal_engine_dir} does not exists."
return engine_dir, multimodal_engine_dir


def prepare_mllama_engine(tensorrt_llm_multimodal_example_root,
tensorrt_llm_mllama_example_root, mllama_model_root,
llm_backend_root):
24 changes: 0 additions & 24 deletions tests/integration/defs/triton_server/conftest.py
@@ -465,30 +465,6 @@ def llava_model_root():
return llava_model_root


@pytest.fixture(scope="session")
def vila_model_root():
models_root = llm_models_root()
assert models_root, "Did you set LLM_MODELS_ROOT?"
vila_model_root = os.path.join(models_root, "vila", "VILA1.5-3b")

assert os.path.exists(
vila_model_root
), f"{vila_model_root} does not exist under NFS LLM_MODELS_ROOT dir"
return vila_model_root


@pytest.fixture(scope="session")
def vila_repo_root():
models_root = llm_models_root()
assert models_root, "Did you set LLM_MODELS_ROOT?"
vila_repo_root = os.path.join(models_root, "vila", "VILA")

assert os.path.exists(
vila_repo_root
), f"{vila_repo_root} does not exist under NFS LLM_MODELS_ROOT dir"
return vila_repo_root


@pytest.fixture(scope="session")
def mllama_model_root():
models_root = llm_models_root()
145 changes: 0 additions & 145 deletions tests/integration/defs/triton_server/test_triton_llm.py
@@ -2658,151 +2658,6 @@ def test_llava_onevision(
validate_by_keyword(output_result, keyword)


@pytest.mark.parametrize("E2E_MODEL_NAME", ["ensemble", "tensorrt_llm_bls"])
@pytest.mark.parametrize("ACCUMULATE_TOKEN", ["False"])
@pytest.mark.parametrize("BLS_INSTANCE_COUNT", ["1"])
@pytest.mark.parametrize("PREPROCESSING_INSTANCE_COUNT", ["1"])
@pytest.mark.parametrize("POSTPROCESSING_INSTANCE_COUNT", ["1"])
@pytest.mark.parametrize("MAX_TOKENS_IN_KV_CACHE", [""])
@pytest.mark.parametrize("MAX_ATTENTION_WINDOW_SIZE", [""])
@pytest.mark.parametrize("BATCH_SCHEDULER_POLICY",
["max_utilization", "guaranteed_no_evict"])
@pytest.mark.parametrize("KV_CACHE_FREE_GPU_MEM_FRACTION", ["0.7"])
@pytest.mark.parametrize("ENABLE_TRT_OVERLAP", ["False"],
ids=["disableTrtOverlap"])
@pytest.mark.parametrize("BATCHING_STRATEGY", ["inflight_fused_batching", "V1"])
@pytest.mark.parametrize("DECOUPLED_MODE", ["True", "False"],
ids=["enableDecoupleMode", "disableDecoupleMode"])
@pytest.mark.parametrize("TRITON_MAX_BATCH_SIZE", ["128"])
@pytest.mark.parametrize("MAX_QUEUE_DELAY_MICROSECONDS", ["0"])
@pytest.mark.parametrize("ENABLE_KV_CACHE_REUSE", ["False"])
@pytest.mark.parametrize("NORMALIZE_LOG_PROBS", ["True"])
@pytest.mark.parametrize("ENABLE_CHUNKED_CONTEXT", ["False"])
@pytest.mark.parametrize("GPU_DEVICE_IDS", [""])
@pytest.mark.parametrize("DECODING_MODE", [""])
@pytest.mark.parametrize("MAX_BEAM_WIDTH", ["1"])
@pytest.mark.parametrize("EXCLUDE_INPUT_IN_OUTPUT", ["False"])
@pytest.mark.parametrize("MAX_NUM_IMAGES", ["4"])
def test_vila(
E2E_MODEL_NAME,
MAX_TOKENS_IN_KV_CACHE,
MAX_ATTENTION_WINDOW_SIZE,
BATCH_SCHEDULER_POLICY,
KV_CACHE_FREE_GPU_MEM_FRACTION,
ENABLE_TRT_OVERLAP,
BATCHING_STRATEGY,
DECOUPLED_MODE,
TRITON_MAX_BATCH_SIZE,
MAX_QUEUE_DELAY_MICROSECONDS,
MAX_BEAM_WIDTH,
ENABLE_KV_CACHE_REUSE,
NORMALIZE_LOG_PROBS,
ENABLE_CHUNKED_CONTEXT,
GPU_DEVICE_IDS,
DECODING_MODE,
PREPROCESSING_INSTANCE_COUNT,
POSTPROCESSING_INSTANCE_COUNT,
ACCUMULATE_TOKEN,
BLS_INSTANCE_COUNT,
EXCLUDE_INPUT_IN_OUTPUT,
MAX_NUM_IMAGES,
tensorrt_llm_multimodal_example_root,
tensorrt_llm_llama_example_root,
vila_model_root,
vila_repo_root,
llm_backend_multimodal_example_root,
llm_backend_venv,
):
if BATCHING_STRATEGY == "V1" and BATCH_SCHEDULER_POLICY == "max_utilization":
pytest.skip("Skipping. V1 doesn't support max_utilization.")

if E2E_MODEL_NAME == "ensemble" and ACCUMULATE_TOKEN == "True":
pytest.skip("Skipping.")

llm_backend_repo_root = os.environ["LLM_BACKEND_ROOT"]

# install vila requirements
requirements_vila = os.path.join(llm_backend_repo_root, "all_models",
"multimodal", "requirements-vila.txt")
check_call(f"pip install -r {requirements_vila}", shell=True)

# Build Engine
ENGINE_PATH, MULTIMODAL_ENGINE_DIR = prepare_vila_engine(
tensorrt_llm_multimodal_example_root, tensorrt_llm_llama_example_root,
vila_model_root, vila_repo_root)
# Prepare model repo
new_model_repo = os.path.join(llm_backend_repo_root, "triton_repo")
prepare_ib_model_repo(llm_backend_repo_root, new_model_repo)

# Prepare multimodal specific repo
prepare_multimodal_model_repo(llm_backend_repo_root, new_model_repo,
"ensemble")
prepare_multimodal_model_repo(llm_backend_repo_root, new_model_repo,
"multimodal_encoders")

# Modify config.pbtxt
TOKENIZER_PATH = os.path.join(vila_model_root, "llm")
modify_ib_config_pbtxt(
new_model_repo,
ENGINE_PATH,
TOKENIZER_PATH,
llm_backend_repo_root,
DECOUPLED_MODE,
MAX_TOKENS_IN_KV_CACHE,
MAX_ATTENTION_WINDOW_SIZE,
BATCH_SCHEDULER_POLICY,
BATCHING_STRATEGY,
KV_CACHE_FREE_GPU_MEM_FRACTION,
EXCLUDE_INPUT_IN_OUTPUT,
ENABLE_TRT_OVERLAP,
TRITON_MAX_BATCH_SIZE,
MAX_QUEUE_DELAY_MICROSECONDS,
MAX_BEAM_WIDTH,
ENABLE_KV_CACHE_REUSE,
NORMALIZE_LOG_PROBS,
ENABLE_CHUNKED_CONTEXT,
GPU_DEVICE_IDS,
DECODING_MODE,
PREPROCESSING_INSTANCE_COUNT,
POSTPROCESSING_INSTANCE_COUNT,
ACCUMULATE_TOKEN,
BLS_INSTANCE_COUNT,
MULTIMODAL_ENGINE_PATH=MULTIMODAL_ENGINE_DIR,
MAX_NUM_IMAGES=MAX_NUM_IMAGES,
)

# Launch Triton Server
launch_server_py = os.path.join(llm_backend_repo_root, "scripts",
"launch_triton_server.py")

# NOTE
# Due to mpi init error, manually set PMIX_MCA_gds=hash (ref: https://github.com/open-mpi/ompi/issues/6981)
check_call(
f"PMIX_MCA_gds=hash python3 {launch_server_py} --world_size=1 --model_repo={new_model_repo}",
shell=True)
check_server_ready()
# Run Test

text_prompt = "<image>\nPlease elaborate what you see in the image?"
run_cmd = [
f"{llm_backend_multimodal_example_root}/client.py",
"--model_type=vila",
f"--hf_model_dir={vila_model_root}",
f"--text='{text_prompt}'",
]
if DECOUPLED_MODE == "True":
run_cmd += [
"--streaming",
]

if E2E_MODEL_NAME == "tensorrt_llm_bls":
run_cmd += [
"--use_bls",
]

venv_check_call(llm_backend_venv, run_cmd)


@pytest.mark.skip_less_device_memory(80000)
@pytest.mark.parametrize("E2E_MODEL_NAME", ["ensemble"])
@pytest.mark.parametrize("ACCUMULATE_TOKEN", ["False"])
1 change: 0 additions & 1 deletion tests/integration/test_lists/test-db/l0_a30.yml
@@ -201,7 +201,6 @@ l0_a30:
- triton_server/test_triton_llm.py::test_gpt_350m_ifb[test_basic-False-1-False-True-False-0-128-enableDecoupleMode-inflight_fused_batching-disableTrtOverlap-max_utilization-1-1-1-True-tensorrt_llm_bls]
- triton_server/test_triton_llm.py::test_llama_v2_7b_ifb[batched_inputs-False-1-False-True-False-0-128-enableDecoupleMode-inflight_fused_batching-disableTrtOverlap-max_utilization-1-1-1-True-tensorrt_llm_bls]
- triton_server/test_triton_llm.py::test_llava[False-1-False-True-False-0-128-enableDecoupleMode-inflight_fused_batching-disableTrtOverlap-0.7-max_utilization-1-1-1-False-tensorrt_llm_bls]
- triton_server/test_triton_llm.py::test_vila[4-False-1-False-True-False-0-128-enableDecoupleMode-inflight_fused_batching-disableTrtOverlap-0.7-max_utilization-1-1-1-False-tensorrt_llm_bls]
- triton_server/test_triton_llm.py::test_medusa_vicuna_7b_ifb[False-1-medusa-False-True-False-0-128-enableDecoupleMode-inflight_fused_batching-disableTrtOverlap-max_utilization-1-1-1-False-ensemble]
- triton_server/test_triton_llm.py::test_eagle_vicuna_7b_ifb[False-1-eagle-False-True-False-0-128-enableDecoupleMode-inflight_fused_batching-disableTrtOverlap-max_utilization-1-1-1-False-ensemble]
- triton_server/test_triton_llm.py::test_llama_v2_7b_ifb[test_stop_words-False-1-False-True-False-0-128-enableDecoupleMode-inflight_fused_batching-disableTrtOverlap-max_utilization-1-1-1-True-tensorrt_llm_bls]
16 changes: 0 additions & 16 deletions tests/integration/test_lists/waives.txt
@@ -465,22 +465,6 @@ unittest/_torch -k "not (modeling or multi_gpu or auto_deploy)" SKIP (https://nv
examples/test_whisper.py::test_llm_whisper_general[large-v3-disable_gemm_plugin-disable_attention_plugin-disable_weight_only-float16-nb:1-use_python_runtime] SKIP (https://nvbugs/5244570)
unittest/_torch/speculative/test_eagle3.py SKIP (https://nvbugs/5280806)
test_e2e.py::test_ptp_quickstart_multimodal[qwen2-vl-7b-instruct-Qwen2-VL-7B-Instruct-image] SKIP (https://nvbugs/5226211)
triton_server/test_triton_llm.py::test_vila[4-False-1-False-True-False-0-128-enableDecoupleMode-inflight_fused_batching-disableTrtOverlap-0.7-max_utilization-1-1-1-False-tensorrt_llm_bls] SKIP (https://nvbugs/4931591)
triton_server/test_triton_llm.py::test_vila[4-False-1-False-True-False-0-128-disableDecoupleMode-V1-disableTrtOverlap-0.7-guaranteed_no_evict-1-1-1-False-ensemble]
triton_server/test_triton_llm.py::test_vila[4-False-1-False-True-False-0-128-disableDecoupleMode-V1-disableTrtOverlap-0.7-guaranteed_no_evict-1-1-1-False-tensorrt_llm_bls]
triton_server/test_triton_llm.py::test_vila[4-False-1-False-True-False-0-128-disableDecoupleMode-V1-disableTrtOverlap-0.7-max_utilization-1-1-1-False-ensemble]
triton_server/test_triton_llm.py::test_vila[4-False-1-False-True-False-0-128-disableDecoupleMode-V1-disableTrtOverlap-0.7-max_utilization-1-1-1-False-tensorrt_llm_bls]
triton_server/test_triton_llm.py::test_vila[4-False-1-False-True-False-0-128-disableDecoupleMode-inflight_fused_batching-disableTrtOverlap-0.7-guaranteed_no_evict-1-1-1-False-ensemble]
triton_server/test_triton_llm.py::test_vila[4-False-1-False-True-False-0-128-disableDecoupleMode-inflight_fused_batching-disableTrtOverlap-0.7-guaranteed_no_evict-1-1-1-False-tensorrt_llm_bls]
triton_server/test_triton_llm.py::test_vila[4-False-1-False-True-False-0-128-disableDecoupleMode-inflight_fused_batching-disableTrtOverlap-0.7-max_utilization-1-1-1-False-ensemble]
triton_server/test_triton_llm.py::test_vila[4-False-1-False-True-False-0-128-disableDecoupleMode-inflight_fused_batching-disableTrtOverlap-0.7-max_utilization-1-1-1-False-tensorrt_llm_bls]
triton_server/test_triton_llm.py::test_vila[4-False-1-False-True-False-0-128-enableDecoupleMode-V1-disableTrtOverlap-0.7-guaranteed_no_evict-1-1-1-False-ensemble]
triton_server/test_triton_llm.py::test_vila[4-False-1-False-True-False-0-128-enableDecoupleMode-V1-disableTrtOverlap-0.7-guaranteed_no_evict-1-1-1-False-tensorrt_llm_bls]
triton_server/test_triton_llm.py::test_vila[4-False-1-False-True-False-0-128-enableDecoupleMode-V1-disableTrtOverlap-0.7-max_utilization-1-1-1-False-ensemble]
triton_server/test_triton_llm.py::test_vila[4-False-1-False-True-False-0-128-enableDecoupleMode-V1-disableTrtOverlap-0.7-max_utilization-1-1-1-False-tensorrt_llm_bls]
triton_server/test_triton_llm.py::test_vila[4-False-1-False-True-False-0-128-enableDecoupleMode-inflight_fused_batching-disableTrtOverlap-0.7-guaranteed_no_evict-1-1-1-False-ensemble]
triton_server/test_triton_llm.py::test_vila[4-False-1-False-True-False-0-128-enableDecoupleMode-inflight_fused_batching-disableTrtOverlap-0.7-guaranteed_no_evict-1-1-1-False-tensorrt_llm_bls]
triton_server/test_triton_llm.py::test_vila[4-False-1-False-True-False-0-128-enableDecoupleMode-inflight_fused_batching-disableTrtOverlap-0.7-max_utilization-1-1-1-False-ensemble]
triton_server/test_triton_rcca.py::test_mistral_beam_search[rcca_4714407-True-10-False-True-False-0-128-disableDecoupleMode-inflight_fused_batching-disableTrtOverlap-guaranteed_no_evict-1-1-1-False-ensemble] SKIP (https://nvbugs/5240060)
triton_server/test_triton.py::test_triton_extensive[triton-extensive] SKIP
triton_server/test_triton.py::test_gpt_speculative_decoding[gpt-speculative-decoding] SKIP