diff --git a/tests/integration/defs/accuracy/references/cnn_dailymail.yaml b/tests/integration/defs/accuracy/references/cnn_dailymail.yaml
index 67781cd8d15..5d5adcbf9b4 100644
--- a/tests/integration/defs/accuracy/references/cnn_dailymail.yaml
+++ b/tests/integration/defs/accuracy/references/cnn_dailymail.yaml
@@ -45,6 +45,14 @@ microsoft/Phi-3.5-mini-instruct:
   - accuracy: 31.354
 microsoft/Phi-4-mini-instruct:
   - accuracy: 32.921
+bigcode/starcoder2-7b:
+  - accuracy: 26.611
+  - quant_algo: FP8
+    accuracy: 26.611
+mistralai/Codestral-22B-v0.1:
+  - accuracy: 30.316
+  - quant_algo: FP8
+    accuracy: 30.316
 state-spaces/mamba-130m-hf:
   - accuracy: 19.470
 lmsys/vicuna-7b-v1.3:
diff --git a/tests/integration/defs/accuracy/references/gsm8k.yaml b/tests/integration/defs/accuracy/references/gsm8k.yaml
index 26de82cbc09..36ab9ae5997 100644
--- a/tests/integration/defs/accuracy/references/gsm8k.yaml
+++ b/tests/integration/defs/accuracy/references/gsm8k.yaml
@@ -159,6 +159,8 @@ microsoft/Phi-4-multimodal-instruct-long-rope:
   - accuracy: 75.85
 microsoft/Phi-4-mini-instruct:
   - accuracy: 82.30
+mistralai/Codestral-22B-v0.1:
+  - accuracy: 67.10
 GPT-OSS/BF16:
   - accuracy: 90.3
 GPT-OSS/MXFP4:
diff --git a/tests/integration/defs/accuracy/references/mmlu.yaml b/tests/integration/defs/accuracy/references/mmlu.yaml
index 7f2bb55e6f7..c8bacfeaf46 100644
--- a/tests/integration/defs/accuracy/references/mmlu.yaml
+++ b/tests/integration/defs/accuracy/references/mmlu.yaml
@@ -229,6 +229,14 @@ nvidia/Nemotron-H-56B-Base-8K:
     accuracy: 83.82
 microsoft/Phi-4-mini-instruct:
   - accuracy: 68.98
+bigcode/starcoder2-7b:
+  - accuracy: 41.35
+  - quant_algo: FP8
+    accuracy: 41.35
+mistralai/Codestral-22B-v0.1:
+  - accuracy: 61.72
+  - quant_algo: FP8
+    accuracy: 61.72
 # Created a dummy accuracy to track tp_size=2 for phi4-mini model.
 # TODO: update once https://nvbugs/5393849 is fixed.
 microsoft/Phi-4-mini-instruct-tp2:
diff --git a/tests/integration/defs/accuracy/test_llm_api.py b/tests/integration/defs/accuracy/test_llm_api.py
index f34bcdb5be4..321591228bd 100644
--- a/tests/integration/defs/accuracy/test_llm_api.py
+++ b/tests/integration/defs/accuracy/test_llm_api.py
@@ -433,3 +433,55 @@ def test_auto_dtype(self):
                  speculative_config=self.speculative_config) as llm:
             task = CnnDailymail(self.MODEL_NAME)
             task.evaluate(llm)
+
+
+class TestStarCoder2_7B(LlmapiAccuracyTestHarness):
+    MODEL_NAME = "bigcode/starcoder2-7b"
+    MODEL_PATH = f"{llm_models_root()}/starcoder2-7b"
+    kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.6)
+
+    @pytest.mark.skip_less_device_memory(70000)
+    def test_auto_dtype(self):
+        with LLM(self.MODEL_PATH, kv_cache_config=self.kv_cache_config) as llm:
+            task = CnnDailymail(self.MODEL_NAME)
+            task.evaluate(llm)
+            task = MMLU(self.MODEL_NAME)
+            task.evaluate(llm)
+
+    @skip_pre_ada
+    @pytest.mark.skip_less_device_memory(70000)
+    def test_fp8(self):
+        quant_config = QuantConfig(QuantAlgo.FP8)
+        with LLM(self.MODEL_PATH,
+                 quant_config=quant_config,
+                 kv_cache_config=self.kv_cache_config) as llm:
+            task = CnnDailymail(self.MODEL_NAME)
+            task.evaluate(llm)
+            task = MMLU(self.MODEL_NAME)
+            task.evaluate(llm)
+
+
+class TestCodestral_22B_V01(LlmapiAccuracyTestHarness):
+    MODEL_NAME = "mistralai/Codestral-22B-v0.1"
+    MODEL_PATH = f"{llm_models_root()}/Codestral-22B-v0.1"
+    kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.6)
+
+    @pytest.mark.skip_less_device_memory(80000)
+    def test_auto_dtype(self):
+        with LLM(self.MODEL_PATH, kv_cache_config=self.kv_cache_config) as llm:
+            task = CnnDailymail(self.MODEL_NAME)
+            task.evaluate(llm)
+            task = MMLU(self.MODEL_NAME)
+            task.evaluate(llm)
+
+    @skip_pre_ada
+    @pytest.mark.skip_less_device_memory(80000)
+    def test_fp8(self):
+        quant_config = QuantConfig(QuantAlgo.FP8)
+        with LLM(self.MODEL_PATH,
+                 quant_config=quant_config,
+                 kv_cache_config=self.kv_cache_config) as llm:
+            task = CnnDailymail(self.MODEL_NAME)
+            task.evaluate(llm)
+            task = MMLU(self.MODEL_NAME)
+            task.evaluate(llm)
diff --git a/tests/integration/defs/accuracy/test_llm_api_pytorch.py b/tests/integration/defs/accuracy/test_llm_api_pytorch.py
index 89483fd2620..b3c1c50ca5d 100644
--- a/tests/integration/defs/accuracy/test_llm_api_pytorch.py
+++ b/tests/integration/defs/accuracy/test_llm_api_pytorch.py
@@ -2431,6 +2431,22 @@ def test_auto_dtype(self):
         task.evaluate(llm)
 
 
+class TestCodestral_22B_V01(LlmapiAccuracyTestHarness):
+    MODEL_NAME = "mistralai/Codestral-22B-v0.1"
+    MODEL_PATH = f"{llm_models_root()}/Codestral-22B-v0.1"
+
+    @pytest.mark.skip_less_device_memory(80000)
+    def test_auto_dtype(self):
+        kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.6)
+        with LLM(self.MODEL_PATH, kv_cache_config=kv_cache_config) as llm:
+            task = CnnDailymail(self.MODEL_NAME)
+            task.evaluate(llm)
+            task = MMLU(self.MODEL_NAME)
+            task.evaluate(llm)
+            task = GSM8K(self.MODEL_NAME)
+            task.evaluate(llm)
+
+
 class TestKanana_Instruct(LlmapiAccuracyTestHarness):
     MODEL_NAME = "kanana-1.5-2.1b-instruct-2505"
     MODEL_PATH = f"{llm_models_root()}/kanana-1.5-2.1b-instruct-2505"
diff --git a/tests/integration/test_lists/qa/llm_function_nim.txt b/tests/integration/test_lists/qa/llm_function_nim.txt
index 4fe1fd3ab00..90b6406806b 100644
--- a/tests/integration/test_lists/qa/llm_function_nim.txt
+++ b/tests/integration/test_lists/qa/llm_function_nim.txt
@@ -21,3 +21,8 @@ accuracy/test_llm_api_pytorch.py::TestNemotronH_56B_Base::test_auto_dtype[tp8-cuda_graph=True]
 accuracy/test_llm_api_pytorch.py::TestNemotronUltra::test_auto_dtype[tp8ep4-cuda_graph=True]
 accuracy/test_llm_api_pytorch.py::TestNemotronUltra::test_fp8_prequantized[tp8ep4-cuda_graph=True]
 accuracy/test_llm_api_pytorch.py::TestNemotronUltra::test_fp8_prequantized[tp8-cuda_graph=True]
+accuracy/test_llm_api.py::TestStarCoder2_7B::test_auto_dtype
+accuracy/test_llm_api.py::TestStarCoder2_7B::test_fp8
+accuracy/test_llm_api.py::TestCodestral_22B_V01::test_auto_dtype
+accuracy/test_llm_api.py::TestCodestral_22B_V01::test_fp8
+accuracy/test_llm_api_pytorch.py::TestCodestral_22B_V01::test_auto_dtype
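For reference, the entries added to llm_function_nim.txt are pytest node IDs rooted at tests/integration/defs. Below is a minimal sketch of how the newly added accuracy tests could be invoked locally; the working directory and the -v flag are assumptions, not part of this change.

import pytest

# Hypothetical local run of the newly listed accuracy tests.
# Node IDs are relative to tests/integration/defs, matching the test-list entries above.
ret = pytest.main([
    "accuracy/test_llm_api.py::TestStarCoder2_7B::test_auto_dtype",
    "accuracy/test_llm_api.py::TestStarCoder2_7B::test_fp8",
    "accuracy/test_llm_api.py::TestCodestral_22B_V01::test_auto_dtype",
    "accuracy/test_llm_api.py::TestCodestral_22B_V01::test_fp8",
    "accuracy/test_llm_api_pytorch.py::TestCodestral_22B_V01::test_auto_dtype",
    "-v",
])
raise SystemExit(ret)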