Commit 7de90a6 (parent: ddf01f6)

Remove vila test (#4376)

Signed-off-by: Iman Tabrizian <[email protected]>

File tree: 5 files changed, 0 additions and 247 deletions


tests/integration/defs/triton_server/build_engines.py

Lines changed: 0 additions & 61 deletions
@@ -575,67 +575,6 @@ def prepare_llava_onevision_engine(tensorrt_llm_multimodal_example_root,
     return engine_dir, multimodal_engine_dir
 
 
-def prepare_vila_engine(tensorrt_llm_multimodal_example_root,
-                        tensorrt_llm_llama_example_root, vila_model_root,
-                        vila_repo_root):
-    # Convert LLAMA from HF
-    ckpt_dir = os.path.join(tensorrt_llm_multimodal_example_root, "model_dir",
-                            "vila")
-    convert_cmd = [
-        "python3",
-        f"{tensorrt_llm_llama_example_root}/convert_checkpoint.py",
-        f"--model_dir={vila_model_root}",
-        f"--output_dir={ckpt_dir}",
-        "--dtype=float16",
-    ]
-
-    # Build VILA
-    engine_dir = os.path.join(tensorrt_llm_multimodal_example_root,
-                              "engine_dir", "vila")
-
-    build_cmd = [
-        "trtllm-build",
-        f"--checkpoint_dir={ckpt_dir}",
-        "--gemm_plugin=float16",
-        "--max_batch_size=8",
-        "--max_multimodal_len=6272",
-        "--max_input_len=2048",
-        "--max_seq_len=2560",
-        f"--output_dir={engine_dir}",
-    ]
-
-    multimodal_engine_dir = os.path.join(tensorrt_llm_multimodal_example_root,
-                                         "tmp", "trt_engines", "VILA1.5-3b",
-                                         "multimodal_encoder")
-
-    build_visual_engine_cmd = [
-        "python3", "build_multimodal_engine.py", "--model_type=vila",
-        f"--model_path={vila_model_root}", "--max_batch_size=32",
-        f"--vila_path={vila_repo_root}", f"--output_dir={multimodal_engine_dir}"
-    ]
-
-    append_timing_cache_args(build_cmd)
-    convert_cmd = " ".join(convert_cmd)
-    build_cmd = " ".join(build_cmd)
-    build_visual_engine_cmd = " ".join(build_visual_engine_cmd)
-    if not os.path.exists(engine_dir):
-        check_call(convert_cmd, shell=True)
-        check_call(build_cmd, shell=True)
-        check_call(build_visual_engine_cmd,
-                   shell=True,
-                   cwd=tensorrt_llm_multimodal_example_root)
-    else:
-        print_info(f"Reusing engine: {engine_dir}")
-        print_info(f"Skipped: {convert_cmd}")
-        print_info(f"Skipped: {build_cmd}")
-        print_info(f"Skipped: {build_visual_engine_cmd}")
-
-    assert os.path.exists(engine_dir), f"{engine_dir} does not exist."
-    assert os.path.exists(
-        multimodal_engine_dir), f"{multimodal_engine_dir} does not exist."
-    return engine_dir, multimodal_engine_dir
-
-
 def prepare_mllama_engine(tensorrt_llm_multimodal_example_root,
                           tensorrt_llm_mllama_example_root, mllama_model_root,
                           llm_backend_root):
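For reference, the removed prepare_vila_engine() helper chained three build steps: HF checkpoint conversion, language-model engine build, and visual-encoder engine build. Below is a minimal standalone sketch of the same flow, assuming placeholder paths; none of the directory values are taken from this commit.

# Hedged sketch of the three commands the removed helper ran.
# All paths below are illustrative placeholders, not repository values.
import subprocess

VILA_MODEL_ROOT = "/models/vila/VILA1.5-3b"  # assumed HF checkpoint location
VILA_REPO_ROOT = "/models/vila/VILA"         # assumed VILA source checkout
CKPT_DIR = "model_dir/vila"
ENGINE_DIR = "engine_dir/vila"
VISUAL_ENGINE_DIR = "tmp/trt_engines/VILA1.5-3b/multimodal_encoder"

# Step 1: convert the HF LLaMA weights into a TensorRT-LLM checkpoint.
subprocess.check_call(
    f"python3 convert_checkpoint.py --model_dir={VILA_MODEL_ROOT} "
    f"--output_dir={CKPT_DIR} --dtype=float16",
    shell=True)

# Step 2: build the language-model engine with the limits the test used.
subprocess.check_call(
    f"trtllm-build --checkpoint_dir={CKPT_DIR} --gemm_plugin=float16 "
    f"--max_batch_size=8 --max_multimodal_len=6272 --max_input_len=2048 "
    f"--max_seq_len=2560 --output_dir={ENGINE_DIR}",
    shell=True)

# Step 3: build the visual-encoder engine.
subprocess.check_call(
    f"python3 build_multimodal_engine.py --model_type=vila "
    f"--model_path={VILA_MODEL_ROOT} --max_batch_size=32 "
    f"--vila_path={VILA_REPO_ROOT} --output_dir={VISUAL_ENGINE_DIR}",
    shell=True)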

tests/integration/defs/triton_server/conftest.py

Lines changed: 0 additions & 24 deletions
@@ -465,30 +465,6 @@ def llava_model_root():
     return llava_model_root
 
 
-@pytest.fixture(scope="session")
-def vila_model_root():
-    models_root = llm_models_root()
-    assert models_root, "Did you set LLM_MODELS_ROOT?"
-    vila_model_root = os.path.join(models_root, "vila", "VILA1.5-3b")
-
-    assert os.path.exists(
-        vila_model_root
-    ), f"{vila_model_root} does not exist under NFS LLM_MODELS_ROOT dir"
-    return vila_model_root
-
-
-@pytest.fixture(scope="session")
-def vila_repo_root():
-    models_root = llm_models_root()
-    assert models_root, "Did you set LLM_MODELS_ROOT?"
-    vila_repo_root = os.path.join(models_root, "vila", "VILA")
-
-    assert os.path.exists(
-        vila_repo_root
-    ), f"{vila_repo_root} does not exist under NFS LLM_MODELS_ROOT dir"
-    return vila_repo_root
-
-
 @pytest.fixture(scope="session")
 def mllama_model_root():
     models_root = llm_models_root()

tests/integration/defs/triton_server/test_triton_llm.py

Lines changed: 0 additions & 145 deletions
@@ -2658,151 +2658,6 @@ def test_llava_onevision(
     validate_by_keyword(output_result, keyword)
 
 
-@pytest.mark.parametrize("E2E_MODEL_NAME", ["ensemble", "tensorrt_llm_bls"])
-@pytest.mark.parametrize("ACCUMULATE_TOKEN", ["False"])
-@pytest.mark.parametrize("BLS_INSTANCE_COUNT", ["1"])
-@pytest.mark.parametrize("PREPROCESSING_INSTANCE_COUNT", ["1"])
-@pytest.mark.parametrize("POSTPROCESSING_INSTANCE_COUNT", ["1"])
-@pytest.mark.parametrize("MAX_TOKENS_IN_KV_CACHE", [""])
-@pytest.mark.parametrize("MAX_ATTENTION_WINDOW_SIZE", [""])
-@pytest.mark.parametrize("BATCH_SCHEDULER_POLICY",
-                         ["max_utilization", "guaranteed_no_evict"])
-@pytest.mark.parametrize("KV_CACHE_FREE_GPU_MEM_FRACTION", ["0.7"])
-@pytest.mark.parametrize("ENABLE_TRT_OVERLAP", ["False"],
-                         ids=["disableTrtOverlap"])
-@pytest.mark.parametrize("BATCHING_STRATEGY", ["inflight_fused_batching", "V1"])
-@pytest.mark.parametrize("DECOUPLED_MODE", ["True", "False"],
-                         ids=["enableDecoupleMode", "disableDecoupleMode"])
-@pytest.mark.parametrize("TRITON_MAX_BATCH_SIZE", ["128"])
-@pytest.mark.parametrize("MAX_QUEUE_DELAY_MICROSECONDS", ["0"])
-@pytest.mark.parametrize("ENABLE_KV_CACHE_REUSE", ["False"])
-@pytest.mark.parametrize("NORMALIZE_LOG_PROBS", ["True"])
-@pytest.mark.parametrize("ENABLE_CHUNKED_CONTEXT", ["False"])
-@pytest.mark.parametrize("GPU_DEVICE_IDS", [""])
-@pytest.mark.parametrize("DECODING_MODE", [""])
-@pytest.mark.parametrize("MAX_BEAM_WIDTH", ["1"])
-@pytest.mark.parametrize("EXCLUDE_INPUT_IN_OUTPUT", ["False"])
-@pytest.mark.parametrize("MAX_NUM_IMAGES", ["4"])
-def test_vila(
-    E2E_MODEL_NAME,
-    MAX_TOKENS_IN_KV_CACHE,
-    MAX_ATTENTION_WINDOW_SIZE,
-    BATCH_SCHEDULER_POLICY,
-    KV_CACHE_FREE_GPU_MEM_FRACTION,
-    ENABLE_TRT_OVERLAP,
-    BATCHING_STRATEGY,
-    DECOUPLED_MODE,
-    TRITON_MAX_BATCH_SIZE,
-    MAX_QUEUE_DELAY_MICROSECONDS,
-    MAX_BEAM_WIDTH,
-    ENABLE_KV_CACHE_REUSE,
-    NORMALIZE_LOG_PROBS,
-    ENABLE_CHUNKED_CONTEXT,
-    GPU_DEVICE_IDS,
-    DECODING_MODE,
-    PREPROCESSING_INSTANCE_COUNT,
-    POSTPROCESSING_INSTANCE_COUNT,
-    ACCUMULATE_TOKEN,
-    BLS_INSTANCE_COUNT,
-    EXCLUDE_INPUT_IN_OUTPUT,
-    MAX_NUM_IMAGES,
-    tensorrt_llm_multimodal_example_root,
-    tensorrt_llm_llama_example_root,
-    vila_model_root,
-    vila_repo_root,
-    llm_backend_multimodal_example_root,
-    llm_backend_venv,
-):
-    if BATCHING_STRATEGY == "V1" and BATCH_SCHEDULER_POLICY == "max_utilization":
-        pytest.skip("Skipping. V1 doesn't support max_utilization.")
-
-    if E2E_MODEL_NAME == "ensemble" and ACCUMULATE_TOKEN == "True":
-        pytest.skip("Skipping.")
-
-    llm_backend_repo_root = os.environ["LLM_BACKEND_ROOT"]
-
-    # install vila requirements
-    requirements_vila = os.path.join(llm_backend_repo_root, "all_models",
-                                     "multimodal", "requirements-vila.txt")
-    check_call(f"pip install -r {requirements_vila}", shell=True)
-
-    # Build Engine
-    ENGINE_PATH, MULTIMODAL_ENGINE_DIR = prepare_vila_engine(
-        tensorrt_llm_multimodal_example_root, tensorrt_llm_llama_example_root,
-        vila_model_root, vila_repo_root)
-    # Prepare model repo
-    new_model_repo = os.path.join(llm_backend_repo_root, "triton_repo")
-    prepare_ib_model_repo(llm_backend_repo_root, new_model_repo)
-
-    # Prepare multimodal specific repo
-    prepare_multimodal_model_repo(llm_backend_repo_root, new_model_repo,
-                                  "ensemble")
-    prepare_multimodal_model_repo(llm_backend_repo_root, new_model_repo,
-                                  "multimodal_encoders")
-
-    # Modify config.pbtxt
-    TOKENIZER_PATH = os.path.join(vila_model_root, "llm")
-    modify_ib_config_pbtxt(
-        new_model_repo,
-        ENGINE_PATH,
-        TOKENIZER_PATH,
-        llm_backend_repo_root,
-        DECOUPLED_MODE,
-        MAX_TOKENS_IN_KV_CACHE,
-        MAX_ATTENTION_WINDOW_SIZE,
-        BATCH_SCHEDULER_POLICY,
-        BATCHING_STRATEGY,
-        KV_CACHE_FREE_GPU_MEM_FRACTION,
-        EXCLUDE_INPUT_IN_OUTPUT,
-        ENABLE_TRT_OVERLAP,
-        TRITON_MAX_BATCH_SIZE,
-        MAX_QUEUE_DELAY_MICROSECONDS,
-        MAX_BEAM_WIDTH,
-        ENABLE_KV_CACHE_REUSE,
-        NORMALIZE_LOG_PROBS,
-        ENABLE_CHUNKED_CONTEXT,
-        GPU_DEVICE_IDS,
-        DECODING_MODE,
-        PREPROCESSING_INSTANCE_COUNT,
-        POSTPROCESSING_INSTANCE_COUNT,
-        ACCUMULATE_TOKEN,
-        BLS_INSTANCE_COUNT,
-        MULTIMODAL_ENGINE_PATH=MULTIMODAL_ENGINE_DIR,
-        MAX_NUM_IMAGES=MAX_NUM_IMAGES,
-    )
-
-    # Launch Triton Server
-    launch_server_py = os.path.join(llm_backend_repo_root, "scripts",
-                                    "launch_triton_server.py")
-
-    # NOTE
-    # Due to mpi init error, manually set PMIX_MCA_gds=hash (ref: https://github.com/open-mpi/ompi/issues/6981)
-    check_call(
-        f"PMIX_MCA_gds=hash python3 {launch_server_py} --world_size=1 --model_repo={new_model_repo}",
-        shell=True)
-    check_server_ready()
-    # Run Test
-
-    text_prompt = "<image>\nPlease elaborate what you see in the image?"
-    run_cmd = [
-        f"{llm_backend_multimodal_example_root}/client.py",
-        "--model_type=vila",
-        f"--hf_model_dir={vila_model_root}",
-        f"--text='{text_prompt}'",
-    ]
-    if DECOUPLED_MODE == "True":
-        run_cmd += [
-            "--streaming",
-        ]
-
-    if E2E_MODEL_NAME == "tensorrt_llm_bls":
-        run_cmd += [
-            "--use_bls",
-        ]
-
-    venv_check_call(llm_backend_venv, run_cmd)
-
-
 @pytest.mark.skip_less_device_memory(80000)
 @pytest.mark.parametrize("E2E_MODEL_NAME", ["ensemble"])
 @pytest.mark.parametrize("ACCUMULATE_TOKEN", ["False"])

tests/integration/test_lists/test-db/l0_a30.yml

Lines changed: 0 additions & 1 deletion
@@ -201,7 +201,6 @@ l0_a30:
   - triton_server/test_triton_llm.py::test_gpt_350m_ifb[test_basic-False-1-False-True-False-0-128-enableDecoupleMode-inflight_fused_batching-disableTrtOverlap-max_utilization-1-1-1-True-tensorrt_llm_bls]
   - triton_server/test_triton_llm.py::test_llama_v2_7b_ifb[batched_inputs-False-1-False-True-False-0-128-enableDecoupleMode-inflight_fused_batching-disableTrtOverlap-max_utilization-1-1-1-True-tensorrt_llm_bls]
   - triton_server/test_triton_llm.py::test_llava[False-1-False-True-False-0-128-enableDecoupleMode-inflight_fused_batching-disableTrtOverlap-0.7-max_utilization-1-1-1-False-tensorrt_llm_bls]
-  - triton_server/test_triton_llm.py::test_vila[4-False-1-False-True-False-0-128-enableDecoupleMode-inflight_fused_batching-disableTrtOverlap-0.7-max_utilization-1-1-1-False-tensorrt_llm_bls]
   - triton_server/test_triton_llm.py::test_medusa_vicuna_7b_ifb[False-1-medusa-False-True-False-0-128-enableDecoupleMode-inflight_fused_batching-disableTrtOverlap-max_utilization-1-1-1-False-ensemble]
   - triton_server/test_triton_llm.py::test_eagle_vicuna_7b_ifb[False-1-eagle-False-True-False-0-128-enableDecoupleMode-inflight_fused_batching-disableTrtOverlap-max_utilization-1-1-1-False-ensemble]
   - triton_server/test_triton_llm.py::test_llama_v2_7b_ifb[test_stop_words-False-1-False-True-False-0-128-enableDecoupleMode-inflight_fused_batching-disableTrtOverlap-max_utilization-1-1-1-True-tensorrt_llm_bls]

tests/integration/test_lists/waives.txt

Lines changed: 0 additions & 16 deletions
@@ -465,22 +465,6 @@ unittest/_torch -k "not (modeling or multi_gpu or auto_deploy)" SKIP (https://nv
 examples/test_whisper.py::test_llm_whisper_general[large-v3-disable_gemm_plugin-disable_attention_plugin-disable_weight_only-float16-nb:1-use_python_runtime] SKIP (https://nvbugs/5244570)
 unittest/_torch/speculative/test_eagle3.py SKIP (https://nvbugs/5280806)
 test_e2e.py::test_ptp_quickstart_multimodal[qwen2-vl-7b-instruct-Qwen2-VL-7B-Instruct-image] SKIP (https://nvbugs/5226211)
-triton_server/test_triton_llm.py::test_vila[4-False-1-False-True-False-0-128-enableDecoupleMode-inflight_fused_batching-disableTrtOverlap-0.7-max_utilization-1-1-1-False-tensorrt_llm_bls] SKIP (https://nvbugs/4931591)
-triton_server/test_triton_llm.py::test_vila[4-False-1-False-True-False-0-128-disableDecoupleMode-V1-disableTrtOverlap-0.7-guaranteed_no_evict-1-1-1-False-ensemble]
-triton_server/test_triton_llm.py::test_vila[4-False-1-False-True-False-0-128-disableDecoupleMode-V1-disableTrtOverlap-0.7-guaranteed_no_evict-1-1-1-False-tensorrt_llm_bls]
-triton_server/test_triton_llm.py::test_vila[4-False-1-False-True-False-0-128-disableDecoupleMode-V1-disableTrtOverlap-0.7-max_utilization-1-1-1-False-ensemble]
-triton_server/test_triton_llm.py::test_vila[4-False-1-False-True-False-0-128-disableDecoupleMode-V1-disableTrtOverlap-0.7-max_utilization-1-1-1-False-tensorrt_llm_bls]
-triton_server/test_triton_llm.py::test_vila[4-False-1-False-True-False-0-128-disableDecoupleMode-inflight_fused_batching-disableTrtOverlap-0.7-guaranteed_no_evict-1-1-1-False-ensemble]
-triton_server/test_triton_llm.py::test_vila[4-False-1-False-True-False-0-128-disableDecoupleMode-inflight_fused_batching-disableTrtOverlap-0.7-guaranteed_no_evict-1-1-1-False-tensorrt_llm_bls]
-triton_server/test_triton_llm.py::test_vila[4-False-1-False-True-False-0-128-disableDecoupleMode-inflight_fused_batching-disableTrtOverlap-0.7-max_utilization-1-1-1-False-ensemble]
-triton_server/test_triton_llm.py::test_vila[4-False-1-False-True-False-0-128-disableDecoupleMode-inflight_fused_batching-disableTrtOverlap-0.7-max_utilization-1-1-1-False-tensorrt_llm_bls]
-triton_server/test_triton_llm.py::test_vila[4-False-1-False-True-False-0-128-enableDecoupleMode-V1-disableTrtOverlap-0.7-guaranteed_no_evict-1-1-1-False-ensemble]
-triton_server/test_triton_llm.py::test_vila[4-False-1-False-True-False-0-128-enableDecoupleMode-V1-disableTrtOverlap-0.7-guaranteed_no_evict-1-1-1-False-tensorrt_llm_bls]
-triton_server/test_triton_llm.py::test_vila[4-False-1-False-True-False-0-128-enableDecoupleMode-V1-disableTrtOverlap-0.7-max_utilization-1-1-1-False-ensemble]
-triton_server/test_triton_llm.py::test_vila[4-False-1-False-True-False-0-128-enableDecoupleMode-V1-disableTrtOverlap-0.7-max_utilization-1-1-1-False-tensorrt_llm_bls]
-triton_server/test_triton_llm.py::test_vila[4-False-1-False-True-False-0-128-enableDecoupleMode-inflight_fused_batching-disableTrtOverlap-0.7-guaranteed_no_evict-1-1-1-False-ensemble]
-triton_server/test_triton_llm.py::test_vila[4-False-1-False-True-False-0-128-enableDecoupleMode-inflight_fused_batching-disableTrtOverlap-0.7-guaranteed_no_evict-1-1-1-False-tensorrt_llm_bls]
-triton_server/test_triton_llm.py::test_vila[4-False-1-False-True-False-0-128-enableDecoupleMode-inflight_fused_batching-disableTrtOverlap-0.7-max_utilization-1-1-1-False-ensemble]
 triton_server/test_triton_rcca.py::test_mistral_beam_search[rcca_4714407-True-10-False-True-False-0-128-disableDecoupleMode-inflight_fused_batching-disableTrtOverlap-guaranteed_no_evict-1-1-1-False-ensemble] SKIP (https://nvbugs/5240060)
 triton_server/test_triton.py::test_triton_extensive[triton-extensive] SKIP
 triton_server/test_triton.py::test_gpt_speculative_decoding[gpt-speculative-decoding] SKIP
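Each waives.txt entry pairs a full pytest node ID with an optional SKIP marker and bug link; note that most of the removed test_vila entries were listed without a SKIP marker. A minimal parsing sketch inferred only from the visible format (parse_waive_line is hypothetical, not the repository's actual waive-list loader):

# Hypothetical helper illustrating the visible waives.txt format.
def parse_waive_line(line: str):
    """Return (test_id, reason) for one waives.txt entry, or None."""
    line = line.strip()
    if not line or line.startswith("#"):
        return None  # skip blanks and comments
    test_id, sep, rest = line.partition(" SKIP")
    if not sep:
        return line, None  # entry listed without an explicit SKIP marker
    reason = rest.strip().strip("()") or None  # e.g. "https://nvbugs/4931591"
    return test_id, reason

print(parse_waive_line(
    "unittest/_torch/speculative/test_eagle3.py SKIP (https://nvbugs/5280806)"))
# -> ('unittest/_torch/speculative/test_eagle3.py', 'https://nvbugs/5280806')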
