@@ -751,6 +751,24 @@ def test_auto_dtype(self):
             task = GSM8K(self.MODEL_NAME)
             task.evaluate(llm)
 
+    def test_fp8_prequantized(self):
+        # Disabling kv cache reuse as a WAR to deal with gaps in kernel support for Gemma3's non-inclusive sliding window size.
+        kv_cache_config = KvCacheConfig(enable_block_reuse=False,
+                                        enable_partial_reuse=False,
+                                        dtype="fp8")
+        prequantized_model_path = f"{llm_models_root()}/gemma/gemma-3-27b-it-fp8/"
+        with LLM(prequantized_model_path,
+                 kv_cache_config=kv_cache_config,
+                 attn_backend="FLASHINFER",
+                 cuda_graph_config=None) as llm:
+            assert llm.args.quant_config.quant_algo == QuantAlgo.FP8
+            task = CnnDailymail(self.MODEL_NAME)
+            task.evaluate(llm)
+            task = GSM8K(self.MODEL_NAME)
+            task.evaluate(llm)
+            task = MMLU(self.MODEL_NAME)
+            task.evaluate(llm)
+
 
 class TestGemma3_1BInstruct(LlmapiAccuracyTestHarness):
     MODEL_NAME = "google/gemma-3-1b-it"
@@ -784,6 +802,8 @@ def test_fp8_prequantized(self):
             assert llm.args.quant_config.quant_algo == QuantAlgo.FP8
             task = CnnDailymail(self.MODEL_NAME)
             task.evaluate(llm)
+            task = GSM8K(self.MODEL_NAME)
+            task.evaluate(llm)
             task = MMLU(self.MODEL_NAME)
             task.evaluate(llm)
 