Merged
Changes from all commits
62 commits
dfb3a2b
Add FP8 K/V Scale and dtype conversion for prefix/prefill
jon-chuang Aug 6, 2024
93bb6e0
remove outdated
jon-chuang Aug 6, 2024
f758ed7
undo incorrect
jon-chuang Aug 6, 2024
c15c038
Empty commit
jon-chuang Aug 6, 2024
cc1be05
Empty commit
jon-chuang Aug 6, 2024
47bf473
fix dtype conversion only when mixed-precision dot not supported
jon-chuang Aug 6, 2024
058df26
improve
jon-chuang Aug 6, 2024
ff5cf50
do torch tensor type conversion for triton consumption
jon-chuang Aug 6, 2024
ea13b24
minor
jon-chuang Aug 6, 2024
5c2a54f
add comments
jon-chuang Aug 6, 2024
2cb96c6
add missing args
jon-chuang Aug 6, 2024
44237b9
Empty commit
jon-chuang Aug 6, 2024
f1a855d
use correct types
jon-chuang Aug 6, 2024
c58450b
fmt
jon-chuang Aug 6, 2024
d9742e1
fix
jon-chuang Aug 7, 2024
87d5588
Merge branch 'main' of https://github.com/vllm-project/vllm into jon-…
jon-chuang Aug 7, 2024
a824564
triton doesn't like ternary if else
jon-chuang Aug 7, 2024
8d32e38
try
jon-chuang Aug 7, 2024
59572c9
convert dtype
jon-chuang Aug 7, 2024
39a1ffa
debug
jon-chuang Aug 7, 2024
452e74f
debug
jon-chuang Aug 7, 2024
105f3d7
improve tests
jon-chuang Aug 7, 2024
48c6f02
skip correctly
jon-chuang Aug 7, 2024
83a0206
skip hf if is fp8 model
jon-chuang Aug 7, 2024
a38da14
moar
jon-chuang Aug 7, 2024
02e0190
done
jon-chuang Aug 7, 2024
fcb4a82
use logprobs for test
jon-chuang Aug 7, 2024
351dc55
Num
jon-chuang Aug 7, 2024
7ddc42b
format
jon-chuang Aug 7, 2024
04870e5
improve tests
jon-chuang Aug 7, 2024
40b2c6c
Merge branch 'main' of https://github.com/vllm-project/vllm into jon-…
jon-chuang Aug 7, 2024
28557a2
use cache dtype
jon-chuang Aug 8, 2024
b88157f
minor
jon-chuang Aug 8, 2024
59d194d
use less strict for fp8 kv cache
jon-chuang Aug 8, 2024
16e144a
minor fix
jon-chuang Aug 8, 2024
047098f
slightly stricter
jon-chuang Aug 8, 2024
584579a
slightly less strict
jon-chuang Aug 8, 2024
371e56b
use 1e-3 safe
jon-chuang Aug 8, 2024
b2833e5
Simplify tests/kernels/test_prefix_prefill.py
jon-chuang Aug 9, 2024
fd18506
try
jon-chuang Aug 9, 2024
e9c5736
Merge branch 'jon-chuang/fix-fp8-triton-kernel' of https://github.com…
jon-chuang Aug 9, 2024
4874589
commit
jon-chuang Aug 9, 2024
5a30c42
Merge branch 'main' of https://github.com/vllm-project/vllm into jon-…
jon-chuang Aug 9, 2024
f80fc34
skip
jon-chuang Aug 9, 2024
ea45417
fix
jon-chuang Aug 9, 2024
f188624
improve
jon-chuang Aug 9, 2024
e3b7881
improve
jon-chuang Aug 9, 2024
de6c8ca
fmt
jon-chuang Aug 9, 2024
ac39c71
minor
jon-chuang Aug 12, 2024
dcda807
Merge branch 'main' of https://github.com/vllm-project/vllm into jon-…
jon-chuang Aug 12, 2024
cb77f1d
improve
jon-chuang Aug 12, 2024
e2998a9
minor
jon-chuang Aug 12, 2024
c09aaf7
split on plus
jon-chuang Aug 12, 2024
eb92719
limit len
jon-chuang Aug 12, 2024
8f106c3
try
jon-chuang Aug 12, 2024
df80fe5
improve
jon-chuang Aug 12, 2024
462a73c
fix
jon-chuang Aug 12, 2024
ab8a3c3
try again
jon-chuang Aug 12, 2024
35373aa
try again
jon-chuang Aug 12, 2024
9ff0075
use
jon-chuang Aug 12, 2024
ca133a4
apply code review
jon-chuang Aug 12, 2024
39e54cb
parse
jon-chuang Aug 12, 2024
2 changes: 0 additions & 2 deletions docs/source/quantization/fp8_e4m3_kvcache.rst
@@ -45,5 +45,3 @@ Here is an example of how to enable this feature:
# output w/ scaling factors: England, the United Kingdom, and one of the world's leading financial,
# output w/o scaling factors: England, located in the southeastern part of the country. It is known

Note, current prefix caching doesn't work with FP8 KV cache enabled, forward_prefix kernel should handle different KV and cache type.

2 changes: 0 additions & 2 deletions docs/source/quantization/fp8_e5m2_kvcache.rst
@@ -32,5 +32,3 @@ Here is an example of how to enable this feature:
print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")


Note, current prefix caching doesn't work with FP8 KV cache enabled, forward_prefix kernel should handle different KV and cache type.
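
The same note is removed from both FP8 KV cache pages: with this change, the forward_prefix path handles a KV cache dtype that differs from the model dtype, so prefix caching / chunked prefill can now be combined with an FP8 KV cache. A minimal usage sketch follows; the model name, flag values, and scale-file path are illustrative assumptions, not taken from this PR.

from vllm import LLM, SamplingParams

# Illustrative sketch only: with this PR an FP8 KV cache can be used together
# with prefix caching / chunked prefill.
llm = LLM(
    model="meta-llama/Llama-2-7b-chat-hf",
    kv_cache_dtype="fp8_e4m3",
    # Optional per-tensor K/V scaling factors exported offline (path is illustrative).
    quantization_param_path="./tests/fp8_kv/llama2-7b-fp8-kv/kv_cache_scales.json",
    enable_prefix_caching=True,
)
params = SamplingParams(temperature=0.0, max_tokens=32)
out = llm.generate(["London is the capital of"], params)
print(out[0].outputs[0].text)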

104 changes: 96 additions & 8 deletions tests/basic_correctness/test_chunked_prefill.py
@@ -6,14 +6,27 @@

Run `pytest tests/models/test_chunked_prefill.py`.
"""

import pytest

from ..models.utils import check_outputs_equal
from ..models.utils import check_logprobs_close, check_outputs_equal

MODELS = [
"facebook/opt-125m",
"meta-llama/Llama-2-7b-hf",
]
E5M2_KV_MODELS = [
"facebook/opt-125m",
"meta-llama/Llama-2-7b-chat-hf",
]
E4M3_KV_MODELS = [
"meta-llama/Llama-2-7b-chat-hf", "nm-testing/Qwen2-1.5B-Instruct-FP8-K-V",
"nm-testing/TinyLlama-1.1B-compressed-tensors-kv-cache-scheme"
]
KV_CACHE_QUANTIZATION_PATHS = {
"meta-llama/Llama-2-7b-chat-hf":
"./tests/fp8_kv/llama2-7b-fp8-kv/kv_cache_scales.json"
}


@pytest.mark.parametrize("model", MODELS)
@@ -35,12 +48,12 @@ def test_models(
enforce_eager: bool,
tensor_parallel_size: int,
) -> None:
max_num_seqs = min(chunked_prefill_token_size, 256)
enable_chunked_prefill = False
max_num_batched_tokens = None
if chunked_prefill_token_size != -1:
enable_chunked_prefill = True
max_num_batched_tokens = chunked_prefill_token_size
"""
Check that greedy decoding with chunked prefill in the vLLM runner
exactly matches the HuggingFace model's outputs.
"""
max_num_seqs = chunked_prefill_token_size
max_num_batched_tokens = chunked_prefill_token_size

with hf_runner(model, dtype=dtype) as hf_model:
hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)
@@ -49,7 +62,7 @@
model,
dtype=dtype,
max_num_batched_tokens=max_num_batched_tokens,
enable_chunked_prefill=enable_chunked_prefill,
enable_chunked_prefill=True,
tensor_parallel_size=tensor_parallel_size,
enforce_eager=enforce_eager,
max_num_seqs=max_num_seqs,
@@ -62,3 +75,78 @@
name_0="hf",
name_1="vllm",
)


@pytest.mark.parametrize("kv_cache_dtype,model",
[("fp8_e5m2", m)
for m in E5M2_KV_MODELS] + [("fp8_e4m3", m)
for m in E4M3_KV_MODELS])
# Due to low-precision numerical divergence, we only check the logprobs of 4 generated tokens
@pytest.mark.parametrize("max_tokens", [4])
@pytest.mark.parametrize("chunked_prefill_token_size", [1, 4, 16])
@pytest.mark.parametrize("enforce_eager", [False, True])
# NOTE: Increasing this in this suite will fail CI because we currently cannot
# reset the distributed env properly. Use a value > 1 only when testing locally.
@pytest.mark.parametrize("tensor_parallel_size", [1])
def test_models_with_fp8_kv_cache(
vllm_runner,
example_prompts,
kv_cache_dtype: str,
model: str,
max_tokens: int,
chunked_prefill_token_size: int,
enforce_eager: bool,
tensor_parallel_size: int,
) -> None:
"""
Check that the logprobs of the chunked-prefill and
non-chunked-prefill versions of the vLLM model runner match.

This test is used when there is a discrepancy in kernels
/ numerics (e.g. when using lower-precision types like FP8).
"""
NUM_LOG_PROBS = 8

if model == "facebook/opt-125m":
pytest.skip(
"#7378: CUDA illegal memory access (undiagnosed) facebook/opt-125m"
)

max_num_seqs = chunked_prefill_token_size
max_num_batched_tokens = chunked_prefill_token_size

extra_kwargs = {}
if model in KV_CACHE_QUANTIZATION_PATHS:
extra_kwargs["quantization_param_path"] = KV_CACHE_QUANTIZATION_PATHS[
model]

with vllm_runner(
model,
tensor_parallel_size=tensor_parallel_size,
enforce_eager=enforce_eager,
max_num_seqs=max_num_seqs,
kv_cache_dtype=kv_cache_dtype,
**extra_kwargs,
) as vllm_model:
no_chunked_prefill_outputs = vllm_model.generate_greedy_logprobs(
example_prompts, max_tokens, NUM_LOG_PROBS)

with vllm_runner(
model,
max_num_batched_tokens=max_num_batched_tokens,
enable_chunked_prefill=True,
tensor_parallel_size=tensor_parallel_size,
enforce_eager=enforce_eager,
max_num_seqs=max_num_seqs,
kv_cache_dtype=kv_cache_dtype,
**extra_kwargs,
) as vllm_model:
chunked_prefill_outputs = vllm_model.generate_greedy_logprobs(
example_prompts, max_tokens, NUM_LOG_PROBS)

check_logprobs_close(
outputs_0_lst=no_chunked_prefill_outputs,
outputs_1_lst=chunked_prefill_outputs,
name_0="no_chunked_prefill",
name_1="chunked_prefill",
)
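
The comparison above does not require identical generated text; it only requires that wherever the two runs pick different tokens, each token still appears in the other run's top-k logprobs. A rough sketch of that check follows — an approximation of tests/models/utils.check_logprobs_close, not a copy of it.

from typing import Dict, List, Sequence, Tuple

# Assumed output shape: (token_ids, generated_text, per-position {token_id: logprob}).
Output = Tuple[List[int], str, List[Dict[int, float]]]

def logprobs_close(outputs_0: Sequence[Output], outputs_1: Sequence[Output]) -> None:
    for (ids_0, _, lps_0), (ids_1, _, lps_1) in zip(outputs_0, outputs_1):
        for pos, (tok_0, tok_1) in enumerate(zip(ids_0, ids_1)):
            if tok_0 == tok_1:
                continue  # same greedy choice; nothing to check at this position
            # Different choices are tolerated only if each token is still a
            # top-k candidate of the other run.
            assert tok_0 in lps_1[pos] and tok_1 in lps_0[pos], (
                f"position {pos}: runs diverged beyond top-k logprobs")
            break  # later positions are conditioned on different prefixes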
33 changes: 26 additions & 7 deletions tests/kernels/test_prefix_prefill.py
@@ -9,6 +9,7 @@

from vllm.attention.backends.xformers import _make_alibi_bias
from vllm.attention.ops.prefix_prefill import context_attention_fwd
from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE

NUM_HEADS = [64]
NUM_QUERIES_PER_KV = [1, 8, 64]
@@ -18,12 +19,14 @@
f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 2)
]
SLIDING_WINDOW = [0, 16, 64, 128, 256, 512, 2048]
KV_CACHE_DTYPES = ["auto", "fp8", "fp8_e5m2"]


@pytest.mark.parametrize("num_heads", NUM_HEADS)
@pytest.mark.parametrize("num_queries_per_kv", NUM_QUERIES_PER_KV)
@pytest.mark.parametrize("head_size", HEAD_SIZES)
@pytest.mark.parametrize("dtype", DTYPES)
@pytest.mark.parametrize("kv_cache_dtype", KV_CACHE_DTYPES)
@pytest.mark.parametrize("device", CUDA_DEVICES)
@pytest.mark.parametrize("sliding_window", SLIDING_WINDOW)
@torch.inference_mode()
@@ -33,6 +36,7 @@ def test_contexted_kv_attention(
head_size: int,
sliding_window: int,
dtype: torch.dtype,
kv_cache_dtype: str,
device: str,
) -> None:
random.seed(0)
@@ -67,16 +71,20 @@
kv.uniform_(-1e-3, 1e-3)
key, value = kv.unbind(dim=1)

if kv_cache_dtype == "auto":
cache_dtype = dtype
else:
cache_dtype = STR_DTYPE_TO_TORCH_DTYPE[kv_cache_dtype]
k_cache = torch.zeros(cache_size,
block_size,
num_kv_heads,
head_size,
dtype=dtype)
dtype=cache_dtype)
v_cache = torch.zeros(cache_size,
block_size,
num_kv_heads,
head_size,
dtype=dtype)
dtype=cache_dtype)
k = torch.zeros(sum(query_lens), num_kv_heads, head_size, dtype=dtype)
v = torch.zeros(sum(query_lens), num_kv_heads, head_size, dtype=dtype)
values = torch.arange(0, cache_size, dtype=torch.long)
@@ -132,6 +140,7 @@ def test_contexted_kv_attention(
k,
v,
output,
kv_cache_dtype,
k_cache,
v_cache,
block_table,
@@ -146,6 +155,7 @@ def test_contexted_kv_attention(
k,
v,
output,
kv_cache_dtype,
k_cache,
v_cache,
block_table,
@@ -208,20 +218,23 @@ def test_contexted_kv_attention(
end_time = time.time()
print(f"xformers Time: {(end_time - start_time)*1000:.2f} ms")
output_ref = output_ref.reshape(output.shape)
assert torch.allclose(output_ref, output, atol=1e-6, rtol=0)
atol = 1e-3 if "fp8" in kv_cache_dtype else 1e-6
torch.testing.assert_close(output, output_ref, atol=atol, rtol=0)


@pytest.mark.parametrize("num_heads", NUM_HEADS)
@pytest.mark.parametrize("num_queries_per_kv", NUM_QUERIES_PER_KV)
@pytest.mark.parametrize("head_size", HEAD_SIZES)
@pytest.mark.parametrize("dtype", DTYPES)
@pytest.mark.parametrize("kv_cache_dtype", KV_CACHE_DTYPES)
@pytest.mark.parametrize("device", CUDA_DEVICES)
@torch.inference_mode()
def test_contexted_kv_attention_alibi(
num_heads: int,
num_queries_per_kv: int,
head_size: int,
dtype: torch.dtype,
kv_cache_dtype: str,
device: str,
) -> None:
random.seed(0)
@@ -282,17 +295,20 @@ def _get_alibi_slopes(total_num_heads: int) -> torch.Tensor:
kv = torch.empty(sum(seq_lens), 2, num_kv_heads, head_size, dtype=dtype)
kv.uniform_(-1e-3, 1e-3)
key, value = kv.unbind(dim=1)

if kv_cache_dtype == "auto":
cache_dtype = dtype
else:
cache_dtype = STR_DTYPE_TO_TORCH_DTYPE[kv_cache_dtype]
k_cache = torch.zeros(cache_size,
block_size,
num_kv_heads,
head_size,
dtype=dtype)
dtype=cache_dtype)
v_cache = torch.zeros(cache_size,
block_size,
num_kv_heads,
head_size,
dtype=dtype)
dtype=cache_dtype)
k = torch.zeros(sum(query_lens), num_kv_heads, head_size, dtype=dtype)
v = torch.zeros(sum(query_lens), num_kv_heads, head_size, dtype=dtype)
values = torch.arange(0, cache_size, dtype=torch.long)
@@ -348,6 +364,7 @@ def _get_alibi_slopes(total_num_heads: int) -> torch.Tensor:
k,
v,
output,
kv_cache_dtype,
k_cache,
v_cache,
block_table,
@@ -362,6 +379,7 @@ def _get_alibi_slopes(total_num_heads: int) -> torch.Tensor:
k,
v,
output,
kv_cache_dtype,
k_cache,
v_cache,
block_table,
@@ -447,4 +465,5 @@ def _get_alibi_slopes(total_num_heads: int) -> torch.Tensor:
torch.cuda.synchronize()
end_time = time.time()
print(f"xformers Time: {(end_time - start_time)*1000:.2f} ms")
assert torch.allclose(output_ref, output, atol=1e-6, rtol=0)
atol = 1e-3 if "fp8" in kv_cache_dtype else 1e-6
torch.testing.assert_close(output, output_ref, atol=atol, rtol=0)
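
The looser atol for FP8 caches reflects quantization error: storing K/V in 8 bits and reading them back loses far more precision than float16 rounding, so the tight 1e-6 tolerance used for the "auto" dtype cannot hold. A standalone sketch of that error for inputs in the same 1e-3 range as this test — an illustration assuming PyTorch's float8_e4m3fn dtype, not part of the PR:

import torch

# Values in the same range the test uses for K/V (uniform in [-1e-3, 1e-3]).
x = torch.empty(1024).uniform_(-1e-3, 1e-3).to(torch.float16)
# Round-trip through an FP8 E4M3 "cache" and back to the compute dtype.
roundtrip = x.to(torch.float8_e4m3fn).to(torch.float16)
err = (x - roundtrip).abs().max().item()
# The error is close to the 1e-3 scale of the inputs themselves, which is why
# the FP8 comparisons above use atol=1e-3 instead of 1e-6.
print(f"max fp8_e4m3 round-trip error: {err:.2e}")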
3 changes: 3 additions & 0 deletions vllm/attention/backends/rocm_flash_attn.py
@@ -459,6 +459,7 @@ def forward(
query,
key,
value,
self.kv_cache_dtype,
key_cache,
value_cache,
prefill_meta.block_tables,
@@ -468,6 +469,8 @@
prefill_meta.max_query_len,
self.alibi_slopes,
self.sliding_window[0],
k_scale,
v_scale,
)

if decode_meta := attn_metadata.decode_metadata:
3 changes: 3 additions & 0 deletions vllm/attention/backends/xformers.py
@@ -604,6 +604,7 @@ def forward(
query,
key,
value,
self.kv_cache_dtype,
key_cache,
value_cache,
prefill_meta.block_tables,
@@ -613,6 +614,8 @@
prefill_meta.max_query_len,
self.alibi_slopes,
self.sliding_window,
k_scale,
v_scale,
)
assert output[:num_prefill_tokens].shape == out.shape
output[:num_prefill_tokens] = out
1 change: 1 addition & 0 deletions vllm/attention/ops/ipex_attn.py
@@ -90,6 +90,7 @@ def forward_prefix(
query: torch.Tensor,
key: torch.Tensor,
value: torch.Tensor,
kv_cache_dtype: str,
key_cache: torch.Tensor,
value_cache: torch.Tensor,
block_tables: torch.Tensor,
6 changes: 6 additions & 0 deletions vllm/attention/ops/paged_attn.py
@@ -194,6 +194,7 @@ def forward_prefix(
query: torch.Tensor,
key: torch.Tensor,
value: torch.Tensor,
kv_cache_dtype: str,
key_cache: torch.Tensor,
value_cache: torch.Tensor,
block_tables: torch.Tensor,
@@ -203,13 +204,16 @@
max_query_len: int,
alibi_slopes: Optional[torch.Tensor],
sliding_window: Optional[int],
k_scale: float,
v_scale: float,
) -> torch.Tensor:
output = torch.empty_like(query)
context_attention_fwd(
query,
key,
value,
output,
kv_cache_dtype,
key_cache,
value_cache,
block_tables,
@@ -218,6 +222,8 @@
seq_lens_tensor,
context_lens,
max_query_len,
k_scale,
v_scale,
alibi_slopes,
sliding_window,
)
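
For orientation, the new k_scale / v_scale arguments follow the usual FP8 KV-cache convention in vLLM: the cache stores value / scale, so reading an FP8 block back means casting to the compute dtype and multiplying by the scale. A reference sketch of that dequantization — an illustration of the convention, not the Triton kernel's actual code:

import torch

def dequantize_cache_block(cache_block: torch.Tensor, scale: float,
                           kv_cache_dtype: str,
                           compute_dtype: torch.dtype) -> torch.Tensor:
    # Reference sketch only. For FP8 caches the stored values were divided by
    # the scale when written, so recover them by casting back to the compute
    # dtype and multiplying by k_scale / v_scale. For "auto" the cache already
    # matches the compute dtype and the scale is effectively 1.0.
    if kv_cache_dtype.startswith("fp8"):
        return cache_block.to(compute_dtype) * scale
    return cache_block.to(compute_dtype)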