@@ -2658,151 +2658,6 @@ def test_llava_onevision(
     validate_by_keyword(output_result, keyword)


-@pytest.mark.parametrize("E2E_MODEL_NAME", ["ensemble", "tensorrt_llm_bls"])
-@pytest.mark.parametrize("ACCUMULATE_TOKEN", ["False"])
-@pytest.mark.parametrize("BLS_INSTANCE_COUNT", ["1"])
-@pytest.mark.parametrize("PREPROCESSING_INSTANCE_COUNT", ["1"])
-@pytest.mark.parametrize("POSTPROCESSING_INSTANCE_COUNT", ["1"])
-@pytest.mark.parametrize("MAX_TOKENS_IN_KV_CACHE", [""])
-@pytest.mark.parametrize("MAX_ATTENTION_WINDOW_SIZE", [""])
-@pytest.mark.parametrize("BATCH_SCHEDULER_POLICY",
-                         ["max_utilization", "guaranteed_no_evict"])
-@pytest.mark.parametrize("KV_CACHE_FREE_GPU_MEM_FRACTION", ["0.7"])
-@pytest.mark.parametrize("ENABLE_TRT_OVERLAP", ["False"],
-                         ids=["disableTrtOverlap"])
-@pytest.mark.parametrize("BATCHING_STRATEGY", ["inflight_fused_batching", "V1"])
-@pytest.mark.parametrize("DECOUPLED_MODE", ["True", "False"],
-                         ids=["enableDecoupleMode", "disableDecoupleMode"])
-@pytest.mark.parametrize("TRITON_MAX_BATCH_SIZE", ["128"])
-@pytest.mark.parametrize("MAX_QUEUE_DELAY_MICROSECONDS", ["0"])
-@pytest.mark.parametrize("ENABLE_KV_CACHE_REUSE", ["False"])
-@pytest.mark.parametrize("NORMALIZE_LOG_PROBS", ["True"])
-@pytest.mark.parametrize("ENABLE_CHUNKED_CONTEXT", ["False"])
-@pytest.mark.parametrize("GPU_DEVICE_IDS", [""])
-@pytest.mark.parametrize("DECODING_MODE", [""])
-@pytest.mark.parametrize("MAX_BEAM_WIDTH", ["1"])
-@pytest.mark.parametrize("EXCLUDE_INPUT_IN_OUTPUT", ["False"])
-@pytest.mark.parametrize("MAX_NUM_IMAGES", ["4"])
-def test_vila(
-    E2E_MODEL_NAME,
-    MAX_TOKENS_IN_KV_CACHE,
-    MAX_ATTENTION_WINDOW_SIZE,
-    BATCH_SCHEDULER_POLICY,
-    KV_CACHE_FREE_GPU_MEM_FRACTION,
-    ENABLE_TRT_OVERLAP,
-    BATCHING_STRATEGY,
-    DECOUPLED_MODE,
-    TRITON_MAX_BATCH_SIZE,
-    MAX_QUEUE_DELAY_MICROSECONDS,
-    MAX_BEAM_WIDTH,
-    ENABLE_KV_CACHE_REUSE,
-    NORMALIZE_LOG_PROBS,
-    ENABLE_CHUNKED_CONTEXT,
-    GPU_DEVICE_IDS,
-    DECODING_MODE,
-    PREPROCESSING_INSTANCE_COUNT,
-    POSTPROCESSING_INSTANCE_COUNT,
-    ACCUMULATE_TOKEN,
-    BLS_INSTANCE_COUNT,
-    EXCLUDE_INPUT_IN_OUTPUT,
-    MAX_NUM_IMAGES,
-    tensorrt_llm_multimodal_example_root,
-    tensorrt_llm_llama_example_root,
-    vila_model_root,
-    vila_repo_root,
-    llm_backend_multimodal_example_root,
-    llm_backend_venv,
-):
-    if BATCHING_STRATEGY == "V1" and BATCH_SCHEDULER_POLICY == "max_utilization":
-        pytest.skip("Skipping. V1 doesn't support max_utilization.")
-
-    if E2E_MODEL_NAME == "ensemble" and ACCUMULATE_TOKEN == "True":
-        pytest.skip("Skipping.")
-
-    llm_backend_repo_root = os.environ["LLM_BACKEND_ROOT"]
-
-    # install vila requirements
-    requirements_vila = os.path.join(llm_backend_repo_root, "all_models",
-                                     "multimodal", "requirements-vila.txt")
-    check_call(f"pip install -r {requirements_vila}", shell=True)
-
-    # Build Engine
-    ENGINE_PATH, MULTIMODAL_ENGINE_DIR = prepare_vila_engine(
-        tensorrt_llm_multimodal_example_root, tensorrt_llm_llama_example_root,
-        vila_model_root, vila_repo_root)
-    # Prepare model repo
-    new_model_repo = os.path.join(llm_backend_repo_root, "triton_repo")
-    prepare_ib_model_repo(llm_backend_repo_root, new_model_repo)
-
-    # Prepare multimodal specific repo
-    prepare_multimodal_model_repo(llm_backend_repo_root, new_model_repo,
-                                  "ensemble")
-    prepare_multimodal_model_repo(llm_backend_repo_root, new_model_repo,
-                                  "multimodal_encoders")
-
-    # Modify config.pbtxt
-    TOKENIZER_PATH = os.path.join(vila_model_root, "llm")
-    modify_ib_config_pbtxt(
-        new_model_repo,
-        ENGINE_PATH,
-        TOKENIZER_PATH,
-        llm_backend_repo_root,
-        DECOUPLED_MODE,
-        MAX_TOKENS_IN_KV_CACHE,
-        MAX_ATTENTION_WINDOW_SIZE,
-        BATCH_SCHEDULER_POLICY,
-        BATCHING_STRATEGY,
-        KV_CACHE_FREE_GPU_MEM_FRACTION,
-        EXCLUDE_INPUT_IN_OUTPUT,
-        ENABLE_TRT_OVERLAP,
-        TRITON_MAX_BATCH_SIZE,
-        MAX_QUEUE_DELAY_MICROSECONDS,
-        MAX_BEAM_WIDTH,
-        ENABLE_KV_CACHE_REUSE,
-        NORMALIZE_LOG_PROBS,
-        ENABLE_CHUNKED_CONTEXT,
-        GPU_DEVICE_IDS,
-        DECODING_MODE,
-        PREPROCESSING_INSTANCE_COUNT,
-        POSTPROCESSING_INSTANCE_COUNT,
-        ACCUMULATE_TOKEN,
-        BLS_INSTANCE_COUNT,
-        MULTIMODAL_ENGINE_PATH=MULTIMODAL_ENGINE_DIR,
-        MAX_NUM_IMAGES=MAX_NUM_IMAGES,
-    )
-
-    # Launch Triton Server
-    launch_server_py = os.path.join(llm_backend_repo_root, "scripts",
-                                    "launch_triton_server.py")
-
-    # NOTE
-    # Due to mpi init error, manually set PMIX_MCA_gds=hash (ref: https://github.com/open-mpi/ompi/issues/6981)
-    check_call(
-        f"PMIX_MCA_gds=hash python3 {launch_server_py} --world_size=1 --model_repo={new_model_repo}",
-        shell=True)
-    check_server_ready()
-    # Run Test
-
-    text_prompt = "<image>\n Please elaborate what you see in the image?"
-    run_cmd = [
-        f"{llm_backend_multimodal_example_root}/client.py",
-        "--model_type=vila",
-        f"--hf_model_dir={vila_model_root}",
-        f"--text='{text_prompt}'",
-    ]
-    if DECOUPLED_MODE == "True":
-        run_cmd += [
-            "--streaming",
-        ]
-
-    if E2E_MODEL_NAME == "tensorrt_llm_bls":
-        run_cmd += [
-            "--use_bls",
-        ]
-
-    venv_check_call(llm_backend_venv, run_cmd)
-
-
 @pytest.mark.skip_less_device_memory(80000)
 @pytest.mark.parametrize("E2E_MODEL_NAME", ["ensemble"])
 @pytest.mark.parametrize("ACCUMULATE_TOKEN", ["False"])