From 522b36944f5f1fafc2a22a7a5387f79f98fb6663 Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Tue, 13 May 2025 11:36:45 +0100 Subject: [PATCH 1/4] Convert `benchmarks` to `ruff format` Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- .pre-commit-config.yaml | 2 +- benchmarks/backend_request_func.py | 189 +++-- benchmarks/benchmark_dataset.py | 355 +++++----- benchmarks/benchmark_latency.py | 73 +- .../benchmark_long_document_qa_throughput.py | 91 +-- benchmarks/benchmark_prefix_caching.py | 116 ++-- benchmarks/benchmark_prioritization.py | 122 ++-- benchmarks/benchmark_serving.py | 584 +++++++++------- .../benchmark_serving_structured_output.py | 488 +++++++------ benchmarks/benchmark_throughput.py | 433 +++++++----- benchmarks/benchmark_utils.py | 15 +- .../cutlass_benchmarks/sparse_benchmarks.py | 372 ++++++---- benchmarks/cutlass_benchmarks/utils.py | 33 +- .../cutlass_benchmarks/w8a8_benchmarks.py | 274 ++++---- .../cutlass_benchmarks/weight_shapes.py | 2 +- .../disagg_prefill_proxy_server.py | 27 +- .../disagg_benchmarks/round_robin_proxy.py | 20 +- .../visualize_benchmark_results.py | 44 +- .../fused_kernels/layernorm_rms_benchmarks.py | 141 ++-- benchmarks/kernels/benchmark_aqlm.py | 196 +++--- benchmarks/kernels/benchmark_bitblas.py | 61 +- .../kernels/benchmark_cutlass_fp4_moe.py | 417 ++++++----- .../kernels/benchmark_grouped_gemm_cutlass.py | 363 ++++++---- benchmarks/kernels/benchmark_layernorm.py | 60 +- benchmarks/kernels/benchmark_lora.py | 653 ++++++++++-------- benchmarks/kernels/benchmark_machete.py | 369 +++++----- benchmarks/kernels/benchmark_marlin.py | 190 +++-- benchmarks/kernels/benchmark_moe.py | 402 ++++++----- .../benchmark_moe_permute_unpermute.py | 253 ++++--- .../kernels/benchmark_paged_attention.py | 97 ++- benchmarks/kernels/benchmark_quant.py | 71 +- benchmarks/kernels/benchmark_rmsnorm.py | 57 +- benchmarks/kernels/benchmark_rope.py | 83 ++- .../kernels/benchmark_w8a8_block_fp8.py | 113 ++- .../benchmark_fp8_block_dense_gemm.py | 4 +- benchmarks/kernels/graph_machete_bench.py | 33 +- benchmarks/kernels/utils.py | 52 +- benchmarks/overheads/benchmark_hashing.py | 36 +- benchmarks/pyproject.toml | 56 ++ pyproject.toml | 1 + 40 files changed, 3996 insertions(+), 2952 deletions(-) create mode 100644 benchmarks/pyproject.toml diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 23f83db01079..7f01c041320b 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -17,7 +17,7 @@ repos: - id: ruff args: [--output-format, github, --fix] - id: ruff-format - files: ^(.buildkite).* + files: ^(.buildkite|benchmarks)/.* - repo: https://github.com/codespell-project/codespell rev: v2.4.1 hooks: diff --git a/benchmarks/backend_request_func.py b/benchmarks/backend_request_func.py index e6a67fda6827..800d426c6d11 100644 --- a/benchmarks/backend_request_func.py +++ b/benchmarks/backend_request_func.py @@ -12,8 +12,7 @@ import aiohttp import huggingface_hub.constants from tqdm.asyncio import tqdm -from transformers import (AutoTokenizer, PreTrainedTokenizer, - PreTrainedTokenizerFast) +from transformers import AutoTokenizer, PreTrainedTokenizer, PreTrainedTokenizerFast # NOTE(simon): do not import vLLM here so the benchmark script # can run without vLLM installed. 
@@ -43,8 +42,7 @@ class RequestFuncOutput: latency: float = 0.0 output_tokens: int = 0 ttft: float = 0.0 # Time to first token - itl: list[float] = field( - default_factory=list) # list of inter-token latencies + itl: list[float] = field(default_factory=list) # list of inter-token latencies tpot: float = 0.0 # avg next-token latencies prompt_len: int = 0 error: str = "" @@ -57,8 +55,9 @@ async def async_request_tgi( api_url = request_func_input.api_url assert api_url.endswith("generate_stream") - async with aiohttp.ClientSession(trust_env=True, - timeout=AIOHTTP_TIMEOUT) as session: + async with aiohttp.ClientSession( + trust_env=True, timeout=AIOHTTP_TIMEOUT + ) as session: params = { "max_new_tokens": request_func_input.output_len, "do_sample": True, @@ -105,8 +104,7 @@ async def async_request_tgi( # Decoding phase else: - output.itl.append(timestamp - - most_recent_timestamp) + output.itl.append(timestamp - most_recent_timestamp) most_recent_timestamp = timestamp @@ -133,8 +131,9 @@ async def async_request_trt_llm( api_url = request_func_input.api_url assert api_url.endswith("generate_stream") - async with aiohttp.ClientSession(trust_env=True, - timeout=AIOHTTP_TIMEOUT) as session: + async with aiohttp.ClientSession( + trust_env=True, timeout=AIOHTTP_TIMEOUT + ) as session: payload = { "accumulate_tokens": True, "text_input": request_func_input.prompt, @@ -159,8 +158,7 @@ async def async_request_trt_llm( if not chunk_bytes: continue - chunk = chunk_bytes.decode("utf-8").removeprefix( - "data:") + chunk = chunk_bytes.decode("utf-8").removeprefix("data:") data = json.loads(chunk) output.generated_text += data["text_output"] @@ -172,8 +170,7 @@ async def async_request_trt_llm( # Decoding phase else: - output.itl.append(timestamp - - most_recent_timestamp) + output.itl.append(timestamp - most_recent_timestamp) most_recent_timestamp = timestamp @@ -197,9 +194,9 @@ async def async_request_deepspeed_mii( request_func_input: RequestFuncInput, pbar: Optional[tqdm] = None, ) -> RequestFuncOutput: - async with aiohttp.ClientSession(trust_env=True, - timeout=AIOHTTP_TIMEOUT) as session: - + async with aiohttp.ClientSession( + trust_env=True, timeout=AIOHTTP_TIMEOUT + ) as session: payload = { "model": request_func_input.model, "prompt": request_func_input.prompt, @@ -217,19 +214,21 @@ async def async_request_deepspeed_mii( st = time.perf_counter() try: - async with session.post(url=request_func_input.api_url, - json=payload) as response: + async with session.post( + url=request_func_input.api_url, json=payload + ) as response: if response.status == 200: parsed_resp = await response.json() output.latency = time.perf_counter() - st if "choices" in parsed_resp: - output.generated_text = parsed_resp["choices"][0][ - "text"] + output.generated_text = parsed_resp["choices"][0]["text"] elif "text" in parsed_resp: output.generated_text = parsed_resp["text"][0] else: - output.error = ("Unexpected response format: " - "neither 'choices' nor 'text' found") + output.error = ( + "Unexpected response format: " + "neither 'choices' nor 'text' found" + ) output.success = False output.success = True else: @@ -250,15 +249,17 @@ async def async_request_openai_completions( pbar: Optional[tqdm] = None, ) -> RequestFuncOutput: api_url = request_func_input.api_url - assert api_url.endswith( - ("completions", "profile") - ), "OpenAI Completions API URL must end with 'completions' or 'profile'." + assert api_url.endswith(("completions", "profile")), ( + "OpenAI Completions API URL must end with 'completions' or 'profile'." 
+ ) - async with aiohttp.ClientSession(trust_env=True, - timeout=AIOHTTP_TIMEOUT) as session: + async with aiohttp.ClientSession( + trust_env=True, timeout=AIOHTTP_TIMEOUT + ) as session: payload = { - "model": request_func_input.model_name \ - if request_func_input.model_name else request_func_input.model, + "model": request_func_input.model_name + if request_func_input.model_name + else request_func_input.model, "prompt": request_func_input.prompt, "temperature": 0.0, "repetition_penalty": 1.0, @@ -273,9 +274,7 @@ async def async_request_openai_completions( payload["ignore_eos"] = request_func_input.ignore_eos if request_func_input.extra_body: payload.update(request_func_input.extra_body) - headers = { - "Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}" - } + headers = {"Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}"} output = RequestFuncOutput() output.prompt_len = request_func_input.prompt_len @@ -284,8 +283,9 @@ async def async_request_openai_completions( st = time.perf_counter() most_recent_timestamp = st try: - async with session.post(url=api_url, json=payload, - headers=headers) as response: + async with session.post( + url=api_url, json=payload, headers=headers + ) as response: if response.status == 200: first_chunk_received = False async for chunk_bytes in response.content: @@ -293,8 +293,7 @@ async def async_request_openai_completions( if not chunk_bytes: continue - chunk = chunk_bytes.decode("utf-8").removeprefix( - "data: ") + chunk = chunk_bytes.decode("utf-8").removeprefix("data: ") if chunk != "[DONE]": data = json.loads(chunk) @@ -314,21 +313,20 @@ async def async_request_openai_completions( # Decoding phase else: - output.itl.append(timestamp - - most_recent_timestamp) + output.itl.append(timestamp - most_recent_timestamp) most_recent_timestamp = timestamp generated_text += text or "" elif usage := data.get("usage"): - output.output_tokens = usage.get( - "completion_tokens") + output.output_tokens = usage.get("completion_tokens") if first_chunk_received: output.success = True else: output.success = False output.error = ( "Never received a valid chunk to calculate TTFT." - "This response will be marked as failed!") + "This response will be marked as failed!" + ) output.generated_text = generated_text output.latency = most_recent_timestamp - st else: @@ -349,23 +347,22 @@ async def async_request_openai_chat_completions( pbar: Optional[tqdm] = None, ) -> RequestFuncOutput: api_url = request_func_input.api_url - assert api_url.endswith( - ("chat/completions", "profile") - ), "OpenAI Chat Completions API URL must end with 'chat/completions'." + assert api_url.endswith(("chat/completions", "profile")), ( + "OpenAI Chat Completions API URL must end with 'chat/completions'." 
+ ) - async with aiohttp.ClientSession(trust_env=True, - timeout=AIOHTTP_TIMEOUT) as session: + async with aiohttp.ClientSession( + trust_env=True, timeout=AIOHTTP_TIMEOUT + ) as session: content = [{"type": "text", "text": request_func_input.prompt}] if request_func_input.multi_modal_content: content.append(request_func_input.multi_modal_content) payload = { - "model": request_func_input.model_name \ - if request_func_input.model_name else request_func_input.model, + "model": request_func_input.model_name + if request_func_input.model_name + else request_func_input.model, "messages": [ - { - "role": "user", - "content": content - }, + {"role": "user", "content": content}, ], "temperature": 0.0, "max_completion_tokens": request_func_input.output_len, @@ -391,16 +388,16 @@ async def async_request_openai_chat_completions( st = time.perf_counter() most_recent_timestamp = st try: - async with session.post(url=api_url, json=payload, - headers=headers) as response: + async with session.post( + url=api_url, json=payload, headers=headers + ) as response: if response.status == 200: async for chunk_bytes in response.content: chunk_bytes = chunk_bytes.strip() if not chunk_bytes: continue - chunk = chunk_bytes.decode("utf-8").removeprefix( - "data: ") + chunk = chunk_bytes.decode("utf-8").removeprefix("data: ") if chunk != "[DONE]": timestamp = time.perf_counter() data = json.loads(chunk) @@ -414,13 +411,11 @@ async def async_request_openai_chat_completions( # Decoding phase else: - output.itl.append(timestamp - - most_recent_timestamp) + output.itl.append(timestamp - most_recent_timestamp) generated_text += content or "" elif usage := data.get("usage"): - output.output_tokens = usage.get( - "completion_tokens") + output.output_tokens = usage.get("completion_tokens") most_recent_timestamp = timestamp @@ -446,25 +441,28 @@ async def async_request_openai_audio( ) -> RequestFuncOutput: # Lazy import without PlaceholderModule to avoid vllm dep. import soundfile + api_url = request_func_input.api_url - assert api_url.endswith( - ("transcriptions", "translations" - )), "OpenAI Chat Completions API URL must end with 'transcriptions' " + assert api_url.endswith(("transcriptions", "translations")), ( + "OpenAI Chat Completions API URL must end with 'transcriptions' " + ) "or `translations`." 
- async with aiohttp.ClientSession(trust_env=True, - timeout=AIOHTTP_TIMEOUT) as session: + async with aiohttp.ClientSession( + trust_env=True, timeout=AIOHTTP_TIMEOUT + ) as session: content = [{"type": "text", "text": request_func_input.prompt}] payload = { - "model": request_func_input.model_name \ - if request_func_input.model_name else request_func_input.model, + "model": request_func_input.model_name + if request_func_input.model_name + else request_func_input.model, "temperature": 0.0, "max_completion_tokens": request_func_input.output_len, "stream": True, "language": "en", # Flattened due to multipart/form-data "stream_include_usage": True, - "stream_continuous_usage_stats": True + "stream_continuous_usage_stats": True, } if request_func_input.extra_body: payload.update(request_func_input.extra_body) @@ -479,9 +477,9 @@ def to_bytes(y, sr): buffer.seek(0) return buffer - with to_bytes(*request_func_input.multi_modal_content['audio']) as f: + with to_bytes(*request_func_input.multi_modal_content["audio"]) as f: form = aiohttp.FormData() - form.add_field('file', f, content_type='audio/wav') + form.add_field("file", f, content_type="audio/wav") for key, value in payload.items(): form.add_field(key, str(value)) @@ -493,24 +491,22 @@ def to_bytes(y, sr): st = time.perf_counter() most_recent_timestamp = st try: - async with session.post(url=api_url, - data=form, - headers=headers) as response: + async with session.post( + url=api_url, data=form, headers=headers + ) as response: if response.status == 200: async for chunk_bytes in response.content: chunk_bytes = chunk_bytes.strip() if not chunk_bytes: continue - chunk = chunk_bytes.decode("utf-8").removeprefix( - "data: ") + chunk = chunk_bytes.decode("utf-8").removeprefix("data: ") if chunk != "[DONE]": timestamp = time.perf_counter() data = json.loads(chunk) if choices := data.get("choices"): - content = choices[0]["delta"].get( - "content") + content = choices[0]["delta"].get("content") # First token if ttft == 0.0: ttft = timestamp - st @@ -519,12 +515,14 @@ def to_bytes(y, sr): # Decoding phase else: output.itl.append( - timestamp - most_recent_timestamp) + timestamp - most_recent_timestamp + ) generated_text += content or "" elif usage := data.get("usage"): output.output_tokens = usage.get( - "completion_tokens") + "completion_tokens" + ) most_recent_timestamp = timestamp @@ -545,7 +543,7 @@ def to_bytes(y, sr): def get_model(pretrained_model_name_or_path: str) -> str: - if os.getenv('VLLM_USE_MODELSCOPE', 'False').lower() == 'true': + if os.getenv("VLLM_USE_MODELSCOPE", "False").lower() == "true": from modelscope import snapshot_download from vllm.model_executor.model_loader.weight_utils import get_lock @@ -556,7 +554,8 @@ def get_model(pretrained_model_name_or_path: str) -> str: model_path = snapshot_download( model_id=pretrained_model_name_or_path, local_files_only=huggingface_hub.constants.HF_HUB_OFFLINE, - ignore_file_pattern=[".*.pt", ".*.safetensors", ".*.bin"]) + ignore_file_pattern=[".*.pt", ".*.safetensors", ".*.bin"], + ) return model_path return pretrained_model_name_or_path @@ -569,23 +568,23 @@ def get_tokenizer( **kwargs, ) -> Union[PreTrainedTokenizer, PreTrainedTokenizerFast]: if pretrained_model_name_or_path is not None and not os.path.exists( - pretrained_model_name_or_path): - pretrained_model_name_or_path = get_model( - pretrained_model_name_or_path) + pretrained_model_name_or_path + ): + pretrained_model_name_or_path = get_model(pretrained_model_name_or_path) if tokenizer_mode == "slow": if 
kwargs.get("use_fast", False): - raise ValueError( - "Cannot use the fast tokenizer in slow tokenizer mode.") + raise ValueError("Cannot use the fast tokenizer in slow tokenizer mode.") kwargs["use_fast"] = False if tokenizer_mode == "mistral": try: from vllm.transformers_utils.tokenizer import MistralTokenizer except ImportError as e: - raise ImportError("MistralTokenizer requires vllm package.\n" - "Please install it with `pip install vllm` " - "to use mistral tokenizer mode.") from e - return MistralTokenizer.from_pretrained( - str(pretrained_model_name_or_path)) + raise ImportError( + "MistralTokenizer requires vllm package.\n" + "Please install it with `pip install vllm` " + "to use mistral tokenizer mode." + ) from e + return MistralTokenizer.from_pretrained(str(pretrained_model_name_or_path)) else: return AutoTokenizer.from_pretrained( pretrained_model_name_or_path, @@ -608,7 +607,7 @@ def get_tokenizer( } OPENAI_COMPATIBLE_BACKENDS = [ - k for k, v in ASYNC_REQUEST_FUNCS.items() - if v in (async_request_openai_completions, - async_request_openai_chat_completions) + k + for k, v in ASYNC_REQUEST_FUNCS.items() + if v in (async_request_openai_completions, async_request_openai_chat_completions) ] diff --git a/benchmarks/benchmark_dataset.py b/benchmarks/benchmark_dataset.py index 98d3360cd6ff..d8f48644cc00 100644 --- a/benchmarks/benchmark_dataset.py +++ b/benchmarks/benchmark_dataset.py @@ -82,14 +82,12 @@ def __init__( self.dataset_path = dataset_path # Set the random seed, ensuring that a None value is replaced with the # default seed. - self.random_seed = (random_seed - if random_seed is not None else self.DEFAULT_SEED) + self.random_seed = random_seed if random_seed is not None else self.DEFAULT_SEED self.data = None def apply_multimodal_chat_transformation( - self, - prompt: str, - mm_content: Optional[MultiModalDataDict] = None) -> list[dict]: + self, prompt: str, mm_content: Optional[MultiModalDataDict] = None + ) -> list[dict]: """ Transform a prompt and optional multimodal content into a chat format. This method is used for chat models that expect a specific conversation @@ -111,8 +109,7 @@ def load_data(self) -> None: NotImplementedError: If a subclass does not implement this method. """ # TODO (jenniferzhao): add support for downloading data - raise NotImplementedError( - "load_data must be implemented in subclasses.") + raise NotImplementedError("load_data must be implemented in subclasses.") def get_random_lora_request( self, @@ -158,8 +155,9 @@ def get_random_lora_request( return lora_request, lora_tokenizer_cache[lora_id] or tokenizer @abstractmethod - def sample(self, tokenizer: PreTrainedTokenizerBase, - num_requests: int) -> list[SampleRequest]: + def sample( + self, tokenizer: PreTrainedTokenizerBase, num_requests: int + ) -> list[SampleRequest]: """ Abstract method to generate sample requests from the dataset. @@ -177,8 +175,9 @@ def sample(self, tokenizer: PreTrainedTokenizerBase, """ raise NotImplementedError("sample must be implemented in subclasses.") - def maybe_oversample_requests(self, requests: list[SampleRequest], - num_requests: int) -> None: + def maybe_oversample_requests( + self, requests: list[SampleRequest], num_requests: int + ) -> None: """ Oversamples the list of requests if its size is less than the desired number. 
@@ -189,11 +188,9 @@ def maybe_oversample_requests(self, requests: list[SampleRequest], """ if len(requests) < num_requests: random.seed(self.random_seed) - additional = random.choices(requests, - k=num_requests - len(requests)) + additional = random.choices(requests, k=num_requests - len(requests)) requests.extend(additional) - logger.info("Oversampled requests to reach %d total samples.", - num_requests) + logger.info("Oversampled requests to reach %d total samples.", num_requests) # ----------------------------------------------------------------------------- @@ -218,14 +215,14 @@ def is_valid_sequence( """ # Check for invalid conditions prompt_too_short = prompt_len < min_len - output_too_short = (not skip_min_output_len_check) and (output_len - < min_len) + output_too_short = (not skip_min_output_len_check) and (output_len < min_len) prompt_too_long = prompt_len > max_prompt_len combined_too_long = (prompt_len + output_len) > max_total_len # Return True if none of the invalid conditions are met - return not (prompt_too_short or output_too_short or prompt_too_long - or combined_too_long) + return not ( + prompt_too_short or output_too_short or prompt_too_long or combined_too_long + ) @cache @@ -257,28 +254,28 @@ def process_image(image: Any) -> Mapping[str, Any]: Raises: ValueError: If the input is not a supported type. """ - if isinstance(image, dict) and 'bytes' in image: - image = Image.open(BytesIO(image['bytes'])) + if isinstance(image, dict) and "bytes" in image: + image = Image.open(BytesIO(image["bytes"])) if isinstance(image, Image.Image): image = image.convert("RGB") with io.BytesIO() as image_data: image.save(image_data, format="JPEG") - image_base64 = base64.b64encode( - image_data.getvalue()).decode("utf-8") + image_base64 = base64.b64encode(image_data.getvalue()).decode("utf-8") return { "type": "image_url", - "image_url": { - "url": f"data:image/jpeg;base64,{image_base64}" - }, + "image_url": {"url": f"data:image/jpeg;base64,{image_base64}"}, } if isinstance(image, str): - image_url = (image if image.startswith( - ("http://", "file://")) else f"file://{image}") + image_url = ( + image if image.startswith(("http://", "file://")) else f"file://{image}" + ) return {"type": "image_url", "image_url": {"url": image_url}} - raise ValueError(f"Invalid image input {image}. Must be a PIL.Image.Image" - " or str or dictionary with raw image bytes.") + raise ValueError( + f"Invalid image input {image}. Must be a PIL.Image.Image" + " or str or dictionary with raw image bytes." 
+ ) # ----------------------------------------------------------------------------- @@ -318,8 +315,11 @@ def sample( num_special_tokens = tokenizer.num_special_tokens_to_add() real_input_len = input_len - num_special_tokens - prefix_token_ids = (np.random.randint( - 0, vocab_size, size=prefix_len).tolist() if prefix_len > 0 else []) + prefix_token_ids = ( + np.random.randint(0, vocab_size, size=prefix_len).tolist() + if prefix_len > 0 + else [] + ) # New sampling logic: [X * (1 - b), X * (1 + b)] input_low = int(real_input_len * (1 - range_ratio)) @@ -329,21 +329,17 @@ def sample( # Add logging for debugging logger.info("Sampling input_len from [%s, %s]", input_low, input_high) - logger.info("Sampling output_len from [%s, %s]", output_low, - output_high) - - input_lens = np.random.randint(input_low, - input_high + 1, - size=num_requests) - output_lens = np.random.randint(output_low, - output_high + 1, - size=num_requests) + logger.info("Sampling output_len from [%s, %s]", output_low, output_high) + + input_lens = np.random.randint(input_low, input_high + 1, size=num_requests) + output_lens = np.random.randint(output_low, output_high + 1, size=num_requests) offsets = np.random.randint(0, vocab_size, size=num_requests) requests = [] for i in range(num_requests): - inner_seq = ((offsets[i] + i + np.arange(input_lens[i])) % - vocab_size).tolist() + inner_seq = ( + (offsets[i] + i + np.arange(input_lens[i])) % vocab_size + ).tolist() token_sequence = prefix_token_ids + inner_seq prompt = tokenizer.decode(token_sequence) # After decoding the prompt we have to encode and decode it again. @@ -354,8 +350,9 @@ def sample( # [1650, 939, 486] -> ['Ġcall', 'sh', 'ere'] # To avoid uncontrolled change of the prompt length, # the encoded sequence is truncated before being decode again. - re_encoded_sequence = tokenizer.encode( - prompt, add_special_tokens=False)[:input_lens[i]] + re_encoded_sequence = tokenizer.encode(prompt, add_special_tokens=False)[ + : input_lens[i] + ] prompt = tokenizer.decode(re_encoded_sequence) total_input_len = prefix_len + int(input_lens[i]) requests.append( @@ -363,7 +360,8 @@ def sample( prompt=prompt, prompt_len=total_input_len, expected_output_len=int(output_lens[i]), - )) + ) + ) return requests @@ -390,7 +388,8 @@ def load_data(self) -> None: self.data = json.load(f) # Filter entries with at least two conversation turns. 
self.data = [ - entry for entry in self.data + entry + for entry in self.data if "conversations" in entry and len(entry["conversations"]) >= 2 ] random.seed(self.random_seed) @@ -416,27 +415,28 @@ def sample( ) lora_request, tokenizer = self.get_random_lora_request( - tokenizer=tokenizer, max_loras=max_loras, lora_path=lora_path) + tokenizer=tokenizer, max_loras=max_loras, lora_path=lora_path + ) prompt_ids = tokenizer(prompt).input_ids completion_ids = tokenizer(completion).input_ids prompt_len = len(prompt_ids) - new_output_len = (len(completion_ids) - if output_len is None else output_len) - if not is_valid_sequence(prompt_len, - new_output_len, - skip_min_output_len_check=output_len - is not None): + new_output_len = len(completion_ids) if output_len is None else output_len + if not is_valid_sequence( + prompt_len, + new_output_len, + skip_min_output_len_check=output_len is not None, + ): continue if enable_multimodal_chat: - prompt = self.apply_multimodal_chat_transformation( - prompt, None) + prompt = self.apply_multimodal_chat_transformation(prompt, None) samples.append( SampleRequest( prompt=prompt, prompt_len=prompt_len, expected_output_len=new_output_len, lora_request=lora_request, - )) + ) + ) self.maybe_oversample_requests(samples, num_requests) return samples @@ -482,20 +482,20 @@ def sample( ) -> list: # Calculate average token length for a poem line. tokenized_lines = [tokenizer(line).input_ids for line in self.data] - avg_len = sum(len(tokens) - for tokens in tokenized_lines) / len(tokenized_lines) + avg_len = sum(len(tokens) for tokens in tokenized_lines) / len(tokenized_lines) # Build the base prompt. base_prompt = "Pick as many lines as you can from these poem lines:\n" base_msg = [{"role": "user", "content": base_prompt}] - base_fmt = tokenizer.apply_chat_template(base_msg, - add_generation_prompt=True, - tokenize=False) + base_fmt = tokenizer.apply_chat_template( + base_msg, add_generation_prompt=True, tokenize=False + ) base_offset = len(tokenizer(base_fmt).input_ids) if input_len <= base_offset: raise ValueError( f"'input_len' must be higher than the base prompt length " - f"({base_offset}).") + f"({base_offset})." + ) # Determine how many poem lines to use. 
num_input_lines = round((input_len - base_offset) / avg_len) @@ -504,21 +504,23 @@ def sample( samples = [] while len(samples) < num_requests: - extra_lines = random.choices(self.data, - k=num_input_lines - num_prefix_lines) + extra_lines = random.choices( + self.data, k=num_input_lines - num_prefix_lines + ) prompt = f"{base_prompt}{''.join(prefix_lines + extra_lines)}" msg = [{"role": "user", "content": prompt}] prompt_formatted = tokenizer.apply_chat_template( - msg, add_generation_prompt=True, tokenize=False) + msg, add_generation_prompt=True, tokenize=False + ) prompt_len = len(tokenizer(prompt_formatted).input_ids) if prompt_len <= input_len: samples.append( SampleRequest( - prompt=prompt_formatted - if return_prompt_formatted else prompt, + prompt=prompt_formatted if return_prompt_formatted else prompt, prompt_len=prompt_len, expected_output_len=output_len, - )) + ) + ) return samples @@ -538,7 +540,9 @@ def __init__(self, **kwargs) -> None: super().__init__(**kwargs) self.load_data() - def load_data(self, ): + def load_data( + self, + ): if self.dataset_path is None: raise ValueError("dataset_path must be provided for loading data.") @@ -552,8 +556,7 @@ def load_data(self, ): def _sample_loaded_data(self, num_requests: int) -> list: if num_requests <= len(self.data): - data = self.data.sample(n=num_requests, - random_state=self.random_seed) + data = self.data.sample(n=num_requests, random_state=self.random_seed) else: data = self.data.sample( n=num_requests, @@ -577,7 +580,8 @@ def sample( input_len = int(data[i][2]) output_len = int(data[i][3]) lora_req, tokenizer = self.get_random_lora_request( - tokenizer=tokenizer, max_loras=max_loras, lora_path=lora_path) + tokenizer=tokenizer, max_loras=max_loras, lora_path=lora_path + ) vocab_size = tokenizer.vocab_size # Generate a synthetic prompt: a list of token IDs computed as (i + # j) modulo vocab_size. 
@@ -589,7 +593,8 @@ def sample( prompt_len=input_len, expected_output_len=output_len, lora_request=lora_req, - )) + ) + ) return samples @@ -632,20 +637,23 @@ def load_data(self) -> None: class ConversationDataset(HuggingFaceDataset): """Dataset for conversation data with multimodal support.""" + SUPPORTED_DATASET_PATHS = { - 'lmms-lab/LLaVA-OneVision-Data', 'Aeala/ShareGPT_Vicuna_unfiltered' + "lmms-lab/LLaVA-OneVision-Data", + "Aeala/ShareGPT_Vicuna_unfiltered", } IS_MULTIMODAL = True - def sample(self, - tokenizer: PreTrainedTokenizerBase, - num_requests: int, - output_len: Optional[int] = None, - enable_multimodal_chat: bool = False, - **kwargs) -> list: + def sample( + self, + tokenizer: PreTrainedTokenizerBase, + num_requests: int, + output_len: Optional[int] = None, + enable_multimodal_chat: bool = False, + **kwargs, + ) -> list: # Filter examples with at least 2 conversations - filtered_data = self.data.filter( - lambda x: len(x["conversations"]) >= 2) + filtered_data = self.data.filter(lambda x: len(x["conversations"]) >= 2) sampled_requests = [] dynamic_output = output_len is None @@ -661,24 +669,22 @@ def sample(self, completion_len = len(completion_ids) output_len = completion_len if dynamic_output else output_len assert isinstance(output_len, int) and output_len > 0 - if dynamic_output and not is_valid_sequence( - prompt_len, completion_len): + if dynamic_output and not is_valid_sequence(prompt_len, completion_len): continue - mm_content = process_image( - item["image"]) if "image" in item else None + mm_content = process_image(item["image"]) if "image" in item else None if enable_multimodal_chat: # Note: when chat is enabled the request prompt_len is no longer # accurate and we will be using request output to count the # actual prompt len and output len - prompt = self.apply_multimodal_chat_transformation( - prompt, mm_content) + prompt = self.apply_multimodal_chat_transformation(prompt, mm_content) sampled_requests.append( SampleRequest( prompt=prompt, prompt_len=prompt_len, expected_output_len=output_len, multi_modal_data=mm_content, - )) + ) + ) self.maybe_oversample_requests(sampled_requests, num_requests) return sampled_requests @@ -695,10 +701,8 @@ class VisionArenaDataset(HuggingFaceDataset): DEFAULT_OUTPUT_LEN = 128 SUPPORTED_DATASET_PATHS = { - "lmarena-ai/VisionArena-Chat": - lambda x: x["conversation"][0][0]["content"], - "lmarena-ai/vision-arena-bench-v0.1": - lambda x: x["turns"][0][0]["content"] + "lmarena-ai/VisionArena-Chat": lambda x: x["conversation"][0][0]["content"], + "lmarena-ai/vision-arena-bench-v0.1": lambda x: x["turns"][0][0]["content"], } IS_MULTIMODAL = True @@ -710,16 +714,14 @@ def sample( enable_multimodal_chat: bool = False, **kwargs, ) -> list: - output_len = (output_len - if output_len is not None else self.DEFAULT_OUTPUT_LEN) + output_len = output_len if output_len is not None else self.DEFAULT_OUTPUT_LEN sampled_requests = [] for item in self.data: if len(sampled_requests) >= num_requests: break parser_fn = self.SUPPORTED_DATASET_PATHS.get(self.dataset_path) if parser_fn is None: - raise ValueError( - f"Unsupported dataset path: {self.dataset_path}") + raise ValueError(f"Unsupported dataset path: {self.dataset_path}") prompt = parser_fn(item) mm_content = process_image(item["images"][0]) prompt_len = len(tokenizer(prompt).input_ids) @@ -727,15 +729,15 @@ def sample( # Note: when chat is enabled the request prompt_len is no longer # accurate and we will be using request output to count the # actual prompt len - prompt = 
self.apply_multimodal_chat_transformation( - prompt, mm_content) + prompt = self.apply_multimodal_chat_transformation(prompt, mm_content) sampled_requests.append( SampleRequest( prompt=prompt, prompt_len=prompt_len, expected_output_len=output_len, multi_modal_data=mm_content, - )) + ) + ) self.maybe_oversample_requests(sampled_requests, num_requests) return sampled_requests @@ -760,14 +762,15 @@ class InstructCoderDataset(HuggingFaceDataset): "likaixin/InstructCoder", } - def sample(self, - tokenizer: PreTrainedTokenizerBase, - num_requests: int, - output_len: Optional[int] = None, - enable_multimodal_chat: bool = False, - **kwargs) -> list: - output_len = (output_len - if output_len is not None else self.DEFAULT_OUTPUT_LEN) + def sample( + self, + tokenizer: PreTrainedTokenizerBase, + num_requests: int, + output_len: Optional[int] = None, + enable_multimodal_chat: bool = False, + **kwargs, + ) -> list: + output_len = output_len if output_len is not None else self.DEFAULT_OUTPUT_LEN sampled_requests = [] for item in self.data: if len(sampled_requests) >= num_requests: @@ -779,7 +782,8 @@ def sample(self, prompt=prompt, prompt_len=prompt_len, expected_output_len=output_len, - )) + ) + ) self.maybe_oversample_requests(sampled_requests, num_requests) return sampled_requests @@ -794,38 +798,38 @@ class MTBenchDataset(HuggingFaceDataset): MT-Bench Dataset. https://huggingface.co/datasets/philschmid/mt-bench - We create a single turn dataset for MT-Bench. + We create a single turn dataset for MT-Bench. This is similar to Spec decoding benchmark setup in vLLM https://github.com/vllm-project/vllm/blob/9d98ab5ec/examples/offline_inference/eagle.py#L14-L18 - """ # noqa: E501 + """ # noqa: E501 DEFAULT_OUTPUT_LEN = 256 # avg len used in SD bench in vLLM SUPPORTED_DATASET_PATHS = { "philschmid/mt-bench", } - def sample(self, - tokenizer: PreTrainedTokenizerBase, - num_requests: int, - output_len: Optional[int] = None, - enable_multimodal_chat: bool = False, - **kwargs) -> list: - output_len = (output_len - if output_len is not None else self.DEFAULT_OUTPUT_LEN) + def sample( + self, + tokenizer: PreTrainedTokenizerBase, + num_requests: int, + output_len: Optional[int] = None, + enable_multimodal_chat: bool = False, + **kwargs, + ) -> list: + output_len = output_len if output_len is not None else self.DEFAULT_OUTPUT_LEN sampled_requests = [] for item in self.data: if len(sampled_requests) >= num_requests: break - prompt = item['turns'][0] + prompt = item["turns"][0] # apply template - prompt = tokenizer.apply_chat_template([{ - "role": "user", - "content": prompt - }], - add_generation_prompt=True, - tokenize=False) + prompt = tokenizer.apply_chat_template( + [{"role": "user", "content": prompt}], + add_generation_prompt=True, + tokenize=False, + ) prompt_len = len(tokenizer(prompt).input_ids) sampled_requests.append( @@ -833,7 +837,8 @@ def sample(self, prompt=prompt, prompt_len=prompt_len, expected_output_len=output_len, - )) + ) + ) self.maybe_oversample_requests(sampled_requests, num_requests) return sampled_requests @@ -847,23 +852,27 @@ class AIMODataset(HuggingFaceDataset): """ Dataset class for processing a AIMO dataset with reasoning questions. 
""" + SUPPORTED_DATASET_PATHS = { - "AI-MO/aimo-validation-aime", "AI-MO/NuminaMath-1.5", - "AI-MO/NuminaMath-CoT" + "AI-MO/aimo-validation-aime", + "AI-MO/NuminaMath-1.5", + "AI-MO/NuminaMath-CoT", } - def sample(self, - tokenizer: PreTrainedTokenizerBase, - num_requests: int, - output_len: Optional[int] = None, - **kwargs) -> list: + def sample( + self, + tokenizer: PreTrainedTokenizerBase, + num_requests: int, + output_len: Optional[int] = None, + **kwargs, + ) -> list: sampled_requests = [] dynamic_output = output_len is None for item in self.data: if len(sampled_requests) >= num_requests: break - prompt, completion = item['problem'], item["solution"] + prompt, completion = item["problem"], item["solution"] prompt_ids = tokenizer(prompt).input_ids completion_ids = tokenizer(completion).input_ids @@ -871,10 +880,9 @@ def sample(self, completion_len = len(completion_ids) output_len = completion_len if dynamic_output else output_len assert isinstance(output_len, int) and output_len > 0 - if dynamic_output and not is_valid_sequence(prompt_len, - completion_len, - max_prompt_len=2048, - max_total_len=32000): + if dynamic_output and not is_valid_sequence( + prompt_len, completion_len, max_prompt_len=2048, max_total_len=32000 + ): continue sampled_requests.append( SampleRequest( @@ -882,7 +890,8 @@ def sample(self, prompt_len=prompt_len, expected_output_len=output_len, multi_modal_data=None, - )) + ) + ) self.maybe_oversample_requests(sampled_requests, num_requests) return sampled_requests @@ -905,25 +914,25 @@ def sample(self, ### Response: -""" # noqa: E501 +""" # noqa: E501 def _format_zeta_prompt( - sample: dict, - original_start_marker: str = "<|editable_region_start|>") -> dict: + sample: dict, original_start_marker: str = "<|editable_region_start|>" +) -> dict: """Format the zeta prompt for the Next Edit Prediction (NEP) dataset. - - This function formats examples from the NEP dataset - into prompts and expected outputs. It could be + + This function formats examples from the NEP dataset + into prompts and expected outputs. It could be further extended to support more NEP datasets. - + Args: - sample: The dataset sample containing events, + sample: The dataset sample containing events, inputs, and outputs. - original_start_marker: The marker indicating the - start of the editable region. Defaults to + original_start_marker: The marker indicating the + start of the editable region. Defaults to "<|editable_region_start|>". - + Returns: A dictionary with the formatted prompts and expected outputs. 
""" @@ -953,10 +962,8 @@ class NextEditPredictionDataset(HuggingFaceDataset): "zed-industries/zeta": _format_zeta_prompt, } - def sample(self, tokenizer: PreTrainedTokenizerBase, num_requests: int, - **kwargs): - formatting_prompt_func = self.MAPPING_PROMPT_FUNCS.get( - self.dataset_path) + def sample(self, tokenizer: PreTrainedTokenizerBase, num_requests: int, **kwargs): + formatting_prompt_func = self.MAPPING_PROMPT_FUNCS.get(self.dataset_path) if formatting_prompt_func is None: raise ValueError(f"Unsupported dataset path: {self.dataset_path}") samples = [] @@ -967,8 +974,10 @@ def sample(self, tokenizer: PreTrainedTokenizerBase, num_requests: int, prompt=sample["prompt"], prompt_len=len(tokenizer(sample["prompt"]).input_ids), expected_output_len=len( - tokenizer(sample["expected_output"]).input_ids), - )) + tokenizer(sample["expected_output"]).input_ids + ), + ) + ) if len(samples) >= num_requests: break self.maybe_oversample_requests(samples, num_requests) @@ -997,18 +1006,22 @@ class ASRDataset(HuggingFaceDataset): | AMI | Meetings | Spontaneous | ihm, sdm | +----------------+----------------------------------------+--------------------------+-----------------------------+ - """ # noqa: E501 + """ # noqa: E501 + SUPPORTED_DATASET_PATHS = { - "openslr/librispeech_asr", "facebook/voxpopuli", "LIUM/tedlium", - "edinburghcstr/ami", "speechcolab/gigaspeech", "kensho/spgispeech" + "openslr/librispeech_asr", + "facebook/voxpopuli", + "LIUM/tedlium", + "edinburghcstr/ami", + "speechcolab/gigaspeech", + "kensho/spgispeech", } DEFAULT_OUTPUT_LEN = 128 IS_MULTIMODAL = True # TODO Whisper-specific. Abstract interface when more models are supported. - TRANSCRIPTION_PREAMBLE = "<|startoftranscript|><|en|><|transcribe|>"\ - "<|notimestamps|>" + TRANSCRIPTION_PREAMBLE = "<|startoftranscript|><|en|><|transcribe|><|notimestamps|>" skip_long_audios: bool = True def sample( @@ -1019,8 +1032,8 @@ def sample( **kwargs, ) -> list: import librosa - output_len = (output_len - if output_len is not None else self.DEFAULT_OUTPUT_LEN) + + output_len = output_len if output_len is not None else self.DEFAULT_OUTPUT_LEN prompt = ASRDataset.TRANSCRIPTION_PREAMBLE prompt_len = len(tokenizer(prompt).input_ids) sampled_requests = [] @@ -1043,10 +1056,14 @@ def sample( prompt_len=prompt_len, expected_output_len=output_len, multi_modal_data=mm_content, - )) + ) + ) if skipped: - logger.warning("%d samples discarded from dataset due to" \ - " their length being greater than" \ - " what Whisper supports.", skipped) + logger.warning( + "%d samples discarded from dataset due to" + " their length being greater than" + " what Whisper supports.", + skipped, + ) self.maybe_oversample_requests(sampled_requests, num_requests) return sampled_requests diff --git a/benchmarks/benchmark_latency.py b/benchmarks/benchmark_latency.py index dfd9bb1e6a4d..713fcd0dc15b 100644 --- a/benchmarks/benchmark_latency.py +++ b/benchmarks/benchmark_latency.py @@ -21,13 +21,14 @@ from vllm.utils import FlexibleArgumentParser -def save_to_pytorch_benchmark_format(args: argparse.Namespace, - results: dict[str, Any]) -> None: +def save_to_pytorch_benchmark_format( + args: argparse.Namespace, results: dict[str, Any] +) -> None: pt_records = convert_to_pytorch_benchmark_format( args=args, metrics={"latency": results["latencies"]}, - extra_info={k: results[k] - for k in ["avg_latency", "percentiles"]}) + extra_info={k: results[k] for k in ["avg_latency", "percentiles"]}, + ) if pt_records: pt_file = f"{os.path.splitext(args.output_json)[0]}.pytorch.json" 
write_to_json(pt_file, pt_records) @@ -42,9 +43,11 @@ def main(args: argparse.Namespace): # the engine will automatically process the request in multiple batches. llm = LLM(**dataclasses.asdict(engine_args)) assert llm.llm_engine.model_config.max_model_len >= ( - args.input_len + - args.output_len), ("Please ensure that max_model_len is greater than" - " the sum of input_len and output_len.") + args.input_len + args.output_len + ), ( + "Please ensure that max_model_len is greater than" + " the sum of input_len and output_len." + ) sampling_params = SamplingParams( n=args.n, @@ -55,18 +58,16 @@ def main(args: argparse.Namespace): detokenize=not args.disable_detokenize, ) print(sampling_params) - dummy_prompt_token_ids = np.random.randint(10000, - size=(args.batch_size, - args.input_len)) - dummy_prompts: list[PromptType] = [{ - "prompt_token_ids": batch - } for batch in dummy_prompt_token_ids.tolist()] + dummy_prompt_token_ids = np.random.randint( + 10000, size=(args.batch_size, args.input_len) + ) + dummy_prompts: list[PromptType] = [ + {"prompt_token_ids": batch} for batch in dummy_prompt_token_ids.tolist() + ] def llm_generate(): if not args.use_beam_search: - llm.generate(dummy_prompts, - sampling_params=sampling_params, - use_tqdm=False) + llm.generate(dummy_prompts, sampling_params=sampling_params, use_tqdm=False) else: llm.beam_search( dummy_prompts, @@ -80,12 +81,13 @@ def llm_generate(): def run_to_completion(profile_dir: Optional[str] = None): if profile_dir: with torch.profiler.profile( - activities=[ - torch.profiler.ProfilerActivity.CPU, - torch.profiler.ProfilerActivity.CUDA, - ], - on_trace_ready=torch.profiler.tensorboard_trace_handler( - str(profile_dir)), + activities=[ + torch.profiler.ProfilerActivity.CPU, + torch.profiler.ProfilerActivity.CUDA, + ], + on_trace_ready=torch.profiler.tensorboard_trace_handler( + str(profile_dir) + ), ) as p: llm_generate() print(p.key_averages().table(sort_by="self_cuda_time_total")) @@ -103,8 +105,9 @@ def run_to_completion(profile_dir: Optional[str] = None): if args.profile: profile_dir = args.profile_result_dir if not profile_dir: - profile_dir = (Path(".") / "vllm_benchmark_result" / - f"latency_result_{time.time()}") + profile_dir = ( + Path(".") / "vllm_benchmark_result" / f"latency_result_{time.time()}" + ) print(f"Profiling (results will be saved to '{profile_dir}')...") run_to_completion(profile_dir=profile_dir) return @@ -135,7 +138,8 @@ def run_to_completion(profile_dir: Optional[str] = None): if __name__ == "__main__": parser = FlexibleArgumentParser( description="Benchmark the latency of processing a single batch of " - "requests till completion.") + "requests till completion." + ) parser.add_argument("--input-len", type=int, default=32) parser.add_argument("--output-len", type=int, default=128) parser.add_argument("--batch-size", type=int, default=8) @@ -152,10 +156,9 @@ def run_to_completion(profile_dir: Optional[str] = None): default=10, help="Number of iterations to run for warmup.", ) - parser.add_argument("--num-iters", - type=int, - default=30, - help="Number of iterations to run.") + parser.add_argument( + "--num-iters", type=int, default=30, help="Number of iterations to run." + ) parser.add_argument( "--profile", action="store_true", @@ -165,8 +168,10 @@ def run_to_completion(profile_dir: Optional[str] = None): "--profile-result-dir", type=str, default=None, - help=("path to save the pytorch profiler output. 
Can be visualized " - "with ui.perfetto.dev or Tensorboard."), + help=( + "path to save the pytorch profiler output. Can be visualized " + "with ui.perfetto.dev or Tensorboard." + ), ) parser.add_argument( "--output-json", @@ -177,8 +182,10 @@ def run_to_completion(profile_dir: Optional[str] = None): parser.add_argument( "--disable-detokenize", action="store_true", - help=("Do not detokenize responses (i.e. do not include " - "detokenization time in the latency measurement)"), + help=( + "Do not detokenize responses (i.e. do not include " + "detokenization time in the latency measurement)" + ), ) parser = EngineArgs.add_cli_args(parser) diff --git a/benchmarks/benchmark_long_document_qa_throughput.py b/benchmarks/benchmark_long_document_qa_throughput.py index 21480578edbd..109624c87789 100644 --- a/benchmarks/benchmark_long_document_qa_throughput.py +++ b/benchmarks/benchmark_long_document_qa_throughput.py @@ -76,7 +76,7 @@ def repeat_prompts(prompts, repeat_count, mode: str): - 'random': Shuffle the prompts randomly after repetition. - 'tile': Repeat the entire prompt list in sequence. Example: [1, 2, 3] -> [1, 2, 3, 1, 2, 3]. - - 'interleave': Repeat each prompt consecutively before moving to + - 'interleave': Repeat each prompt consecutively before moving to the next. Example: [1, 2, 3] -> [1, 1, 2, 2, 3, 3]. Returns: @@ -86,20 +86,21 @@ def repeat_prompts(prompts, repeat_count, mode: str): ValueError: If an invalid mode is provided. """ print("Repeat mode: ", mode) - if mode == 'random': + if mode == "random": repeated_prompts = prompts * repeat_count random.shuffle(repeated_prompts) return repeated_prompts - elif mode == 'tile': + elif mode == "tile": return prompts * repeat_count - elif mode == 'interleave': + elif mode == "interleave": repeated_prompts = [] for prompt in prompts: repeated_prompts.extend([prompt] * repeat_count) return repeated_prompts else: - raise ValueError(f"Invalid mode: {mode}, only support " - "'random', 'tile', 'interleave'") + raise ValueError( + f"Invalid mode: {mode}, only support 'random', 'tile', 'interleave'" + ) def main(args): @@ -109,16 +110,16 @@ def main(args): # we append the document id at the beginning to avoid any of the document # being the prefix of other documents prompts = [ - str(i) + ' '.join(['hi'] * args.document_length) + str(i) + " ".join(["hi"] * args.document_length) for i in range(args.num_documents) ] prompts = repeat_prompts(prompts, args.repeat_count, mode=args.repeat_mode) warmup_prompts = [ - "This is warm up request " + str(i) + \ - ' '.join(['hi'] * args.document_length) - for i in range(args.num_documents)] + "This is warm up request " + str(i) + " ".join(["hi"] * args.document_length) + for i in range(args.num_documents) + ] # Create the LLM engine engine_args = EngineArgs.from_cli_args(args) @@ -142,42 +143,52 @@ def main(args): if __name__ == "__main__": parser = FlexibleArgumentParser( - description= - 'Benchmark the performance with or without automatic prefix caching.') + description="Benchmark the performance with or " + "without automatic prefix caching." 
+ ) parser.add_argument( - '--document-length', + "--document-length", type=int, # Roughly the number of tokens for a system paper, # excluding images default=20000, - help='Range of input lengths for sampling prompts,' - 'specified as "min:max" (e.g., "128:256").') - - parser.add_argument('--num-documents', - type=int, - default=8, - help='Range of input lengths for sampling prompts,' - 'specified as "min:max" (e.g., "128:256").') - - parser.add_argument('--output-len', type=int, default=10) - - parser.add_argument('--repeat-count', - type=int, - default=2, - help='Number of times to repeat each prompt') - - parser.add_argument("--repeat-mode", - type=str, - default='random', - help='The mode to repeat prompts. The supported ' - 'modes are "random", "tile", and "interleave". ' - 'See repeat_prompts() in the source code for details.') - - parser.add_argument("--shuffle-seed", - type=int, - default=0, - help='Random seed when the repeat mode is "random"') + help="Range of input lengths for sampling prompts, " + 'specified as "min:max" (e.g., "128:256").', + ) + + parser.add_argument( + "--num-documents", + type=int, + default=8, + help="Range of input lengths for sampling prompts, " + 'specified as "min:max" (e.g., "128:256").', + ) + + parser.add_argument("--output-len", type=int, default=10) + + parser.add_argument( + "--repeat-count", + type=int, + default=2, + help="Number of times to repeat each prompt", + ) + + parser.add_argument( + "--repeat-mode", + type=str, + default="random", + help="The mode to repeat prompts. The supported " + 'modes are "random", "tile", and "interleave". ' + "See repeat_prompts() in the source code for details.", + ) + + parser.add_argument( + "--shuffle-seed", + type=int, + default=0, + help='Random seed when the repeat mode is "random"', + ) parser = EngineArgs.add_cli_args(parser) args = parser.parse_args() diff --git a/benchmarks/benchmark_prefix_caching.py b/benchmarks/benchmark_prefix_caching.py index f44da95d3216..ffaa8035797c 100644 --- a/benchmarks/benchmark_prefix_caching.py +++ b/benchmarks/benchmark_prefix_caching.py @@ -63,8 +63,7 @@ class Request: output_len: int -def sample_tokens(tokenizer: PreTrainedTokenizerBase, - length: int) -> list[int]: +def sample_tokens(tokenizer: PreTrainedTokenizerBase, length: int) -> list[int]: vocab = tokenizer.get_vocab() all_special_ids = set(tokenizer.all_special_ids) @@ -91,8 +90,10 @@ def sample_requests_from_dataset( # Filter out the conversations with less than 2 turns. dataset = [data for data in dataset if len(data["conversations"]) >= 2] # Only keep the first two turns of each conversation. - dataset = [(data["conversations"][0]["value"], - data["conversations"][1]["value"]) for data in dataset] + dataset = [ + (data["conversations"][0]["value"], data["conversations"][1]["value"]) + for data in dataset + ] # Shuffle the dataset. 
random.shuffle(dataset)   @@ -113,8 +114,9 @@ def sample_requests_from_dataset( completion = dataset[i][1] completion_token_ids = tokenizer(completion).input_ids prompt_len = len(prompt_token_ids) - output_len = (len(completion_token_ids) - if fixed_output_len is None else fixed_output_len) + output_len = ( + len(completion_token_ids) if fixed_output_len is None else fixed_output_len + ) if min_len <= prompt_len <= max_len: filtered_requests.append(Request(prompt, prompt_len, output_len)) @@ -128,27 +130,27 @@ def sample_requests_from_random( fixed_output_len: Optional[int], prefix_len: int, ) -> list[Request]: - requests = [] prefix_token_ids = sample_tokens(tokenizer, prefix_len) min_len, max_len = input_length_range for i in range(num_requests): unique_part_token_ids = sample_tokens( - tokenizer, - random.randint(min_len - prefix_len, max_len - prefix_len)) + tokenizer, random.randint(min_len - prefix_len, max_len - prefix_len) + ) prompt_token_ids = prefix_token_ids + unique_part_token_ids prompt = tokenizer.decode(prompt_token_ids) prompt_len = len(prompt_token_ids) - assert (min_len <= prompt_len <= max_len - ), f"prompt_len {prompt_len} out of range {min_len}:{max_len}" + assert min_len <= prompt_len <= max_len, ( + f"prompt_len {prompt_len} out of range {min_len}:{max_len}" + ) requests.append(Request(prompt, prompt_len, fixed_output_len)) return requests -def repeat_and_sort_requests(requests: list[Request], - repeat_count: int, - sort: bool = False) -> list[str]: +def repeat_and_sort_requests( + requests: list[Request], repeat_count: int, sort: bool = False +) -> list[str]: repeated_requests = requests * repeat_count if sort: repeated_requests.sort(key=lambda x: x[1]) @@ -159,14 +161,14 @@ def repeat_and_sort_requests(requests: list[Request], def main(args): tokenizer = get_tokenizer(args.model, trust_remote_code=True) - input_length_range = tuple(map(int, args.input_length_range.split(':'))) + input_length_range = tuple(map(int, args.input_length_range.split(":"))) random.seed(args.seed) if args.dataset_path is not None: if args.prefix_len > 0: - raise ValueError("prefix-len is not supported when " - "dataset-path is provided.") + raise ValueError( + "prefix-len is not supported when dataset-path is provided." 
+ ) + print(f"Start to sample {args.num_prompts} prompts from {args.dataset_path}") filtered_requests = sample_requests_from_dataset( dataset_path=args.dataset_path, num_requests=args.num_prompts, @@ -196,14 +198,16 @@ def main(args): llm = LLM(**dataclasses.asdict(engine_args)) - sampling_params = SamplingParams(temperature=0, - max_tokens=args.output_len, - detokenize=not args.disable_detokenize) + sampling_params = SamplingParams( + temperature=0, + max_tokens=args.output_len, + detokenize=not args.disable_detokenize, + ) print("Testing filtered requests") - prompts = repeat_and_sort_requests(filtered_requests, - repeat_count=args.repeat_count, - sort=args.sort) + prompts = repeat_and_sort_requests( + filtered_requests, repeat_count=args.repeat_count, sort=args.sort + ) print("------start generating------") test_prefix( @@ -215,29 +219,35 @@ def main(args): if __name__ == "__main__": parser = FlexibleArgumentParser( - description= - 'Benchmark the performance with or without automatic prefix caching.') - parser.add_argument("--dataset-path", - type=str, - default=None, - help="Path to the dataset.") - parser.add_argument('--output-len', type=int, default=10) - parser.add_argument('--num-prompts', - type=int, - required=True, - help="Number of the prompts sampled from dataset") - parser.add_argument('--repeat-count', - type=int, - default=1, - help='Number of times to repeat each prompt') - parser.add_argument('--sort', - action='store_true', - help='Sort prompts by input length') - parser.add_argument('--input-length-range', - type=str, - required=True, - help='Range of input lengths for sampling prompts,' - 'specified as "min:max" (e.g., "128:256").') + description="Benchmark the performance with or without " + "automatic prefix caching." + ) + parser.add_argument( + "--dataset-path", type=str, default=None, help="Path to the dataset." + ) + parser.add_argument("--output-len", type=int, default=10) + parser.add_argument( + "--num-prompts", + type=int, + required=True, + help="Number of the prompts sampled from dataset", + ) + parser.add_argument( + "--repeat-count", + type=int, + default=1, + help="Number of times to repeat each prompt", + ) + parser.add_argument( + "--sort", action="store_true", help="Sort prompts by input length" + ) + parser.add_argument( + "--input-length-range", + type=str, + required=True, + help="Range of input lengths for sampling prompts," + 'specified as "min:max" (e.g., "128:256").', + ) parser.add_argument( "--prefix-len", type=int, @@ -248,10 +258,12 @@ def main(args): "when dataset-path is not provided.", ) parser.add_argument( - '--disable-detokenize', - action='store_true', - help=("Do not detokenize responses (i.e. do not include " - "detokenization time in the latency measurement)"), + "--disable-detokenize", + action="store_true", + help=( + "Do not detokenize responses (i.e. 
do not include " + "detokenization time in the latency measurement)" + ), ) parser = EngineArgs.add_cli_args(parser) diff --git a/benchmarks/benchmark_prioritization.py b/benchmarks/benchmark_prioritization.py index 76fe00ede249..a05dd24dece8 100644 --- a/benchmarks/benchmark_prioritization.py +++ b/benchmarks/benchmark_prioritization.py @@ -1,5 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 """Benchmark offline prioritization.""" + import argparse import dataclasses import json @@ -13,7 +14,7 @@ from vllm.utils import FlexibleArgumentParser -#Select a equi-probable random priority +# Select a equi-probable random priority def get_random_flag(): return 0 if random.random() < 0.5 else 1 @@ -33,8 +34,10 @@ def sample_requests( # Filter out the conversations with less than 2 turns. dataset = [data for data in dataset if len(data["conversations"]) >= 2] # Only keep the first two turns of each conversation. - dataset = [(data["conversations"][0]["value"], - data["conversations"][1]["value"]) for data in dataset] + dataset = [ + (data["conversations"][0]["value"], data["conversations"][1]["value"]) + for data in dataset + ] # Shuffle the dataset. random.shuffle(dataset) @@ -51,8 +54,9 @@ def sample_requests( completion = dataset[i][1] completion_token_ids = tokenizer(completion).input_ids prompt_len = len(prompt_token_ids) - output_len = len(completion_token_ids - ) if fixed_output_len is None else fixed_output_len + output_len = ( + len(completion_token_ids) if fixed_output_len is None else fixed_output_len + ) if prompt_len < 4 or output_len < 4: # Prune too short sequences. continue @@ -74,13 +78,16 @@ def run_vllm( disable_detokenize: bool = False, ) -> float: from vllm import LLM, SamplingParams + llm = LLM(**dataclasses.asdict(engine_args)) assert all( llm.llm_engine.model_config.max_model_len >= (request[1] + request[2]) - for request in requests), ( - "Please ensure that max_model_len is greater than the sum of" - " input_len and output_len for all requests.") + for request in requests + ), ( + "Please ensure that max_model_len is greater than the sum of" + " input_len and output_len for all requests." + ) # Add the requests to the engine. prompts = [] @@ -97,7 +104,8 @@ def run_vllm( ignore_eos=True, max_tokens=output_len, detokenize=not disable_detokenize, - )) + ) + ) start = time.perf_counter() llm.generate(prompts, sampling_params, priority=priority, use_tqdm=True) @@ -111,26 +119,33 @@ def main(args: argparse.Namespace): # Sample the requests. tokenizer = AutoTokenizer.from_pretrained( - args.tokenizer, trust_remote_code=args.trust_remote_code) + args.tokenizer, trust_remote_code=args.trust_remote_code + ) if args.dataset is None: # Synthesize a prompt with the given input length. 
prompt = "hi" * (args.input_len - 1) - requests = [(prompt, args.input_len, args.output_len, - get_random_flag()) for _ in range(args.num_prompts)] + requests = [ + (prompt, args.input_len, args.output_len, get_random_flag()) + for _ in range(args.num_prompts) + ] else: - requests = sample_requests(args.dataset, args.num_prompts, tokenizer, - args.output_len) + requests = sample_requests( + args.dataset, args.num_prompts, tokenizer, args.output_len + ) if args.backend == "vllm": - elapsed_time = run_vllm(requests, args.n, - EngineArgs.from_cli_args(args), - args.disable_detokenize) + elapsed_time = run_vllm( + requests, args.n, EngineArgs.from_cli_args(args), args.disable_detokenize + ) else: raise ValueError(f"Unknown backend: {args.backend}") - total_num_tokens = sum(prompt_len + output_len - for _, prompt_len, output_len, priority in requests) - print(f"Throughput: {len(requests) / elapsed_time:.2f} requests/s, " - f"{total_num_tokens / elapsed_time:.2f} tokens/s") + total_num_tokens = sum( + prompt_len + output_len for _, prompt_len, output_len, priority in requests + ) + print( + f"Throughput: {len(requests) / elapsed_time:.2f} requests/s, " + f"{total_num_tokens / elapsed_time:.2f} tokens/s" + ) # Output JSON results if specified if args.output_json: @@ -147,41 +162,44 @@ def main(args: argparse.Namespace): if __name__ == "__main__": parser = FlexibleArgumentParser(description="Benchmark the throughput.") - parser.add_argument("--backend", - type=str, - choices=["vllm", "hf", "mii"], - default="vllm") - parser.add_argument("--dataset", - type=str, - default=None, - help="Path to the dataset.") - parser.add_argument("--input-len", - type=int, - default=None, - help="Input prompt length for each request") - parser.add_argument("--output-len", - type=int, - default=None, - help="Output length for each request. Overrides the " - "output length from the dataset.") - parser.add_argument("--n", - type=int, - default=1, - help="Number of generated sequences per prompt.") - parser.add_argument("--num-prompts", - type=int, - default=200, - help="Number of prompts to process.") parser.add_argument( - '--output-json', + "--backend", type=str, choices=["vllm", "hf", "mii"], default="vllm" + ) + parser.add_argument( + "--dataset", type=str, default=None, help="Path to the dataset." + ) + parser.add_argument( + "--input-len", + type=int, + default=None, + help="Input prompt length for each request", + ) + parser.add_argument( + "--output-len", + type=int, + default=None, + help="Output length for each request. Overrides the " + "output length from the dataset.", + ) + parser.add_argument( + "--n", type=int, default=1, help="Number of generated sequences per prompt." + ) + parser.add_argument( + "--num-prompts", type=int, default=200, help="Number of prompts to process." + ) + parser.add_argument( + "--output-json", type=str, default=None, - help='Path to save the throughput results in JSON format.') + help="Path to save the throughput results in JSON format.", + ) parser.add_argument( - '--disable-detokenize', - action='store_true', - help=("Do not detokenize responses (i.e. do not include " - "detokenization time in the latency measurement)"), + "--disable-detokenize", + action="store_true", + help=( + "Do not detokenize responses (i.e. 
do not include " + "detokenization time in the latency measurement)" + ), ) parser = EngineArgs.add_cli_args(parser) diff --git a/benchmarks/benchmark_serving.py b/benchmarks/benchmark_serving.py index 89fb0e1df035..883e4e47d028 100644 --- a/benchmarks/benchmark_serving.py +++ b/benchmarks/benchmark_serving.py @@ -20,6 +20,7 @@ --endpoint /generate_stream to the end of the command above. """ + import argparse import asyncio import gc @@ -34,9 +35,12 @@ from typing import Any, Optional import numpy as np -from backend_request_func import (ASYNC_REQUEST_FUNCS, - OPENAI_COMPATIBLE_BACKENDS, RequestFuncInput, - RequestFuncOutput) +from backend_request_func import ( + ASYNC_REQUEST_FUNCS, + OPENAI_COMPATIBLE_BACKENDS, + RequestFuncInput, + RequestFuncOutput, +) from tqdm.asyncio import tqdm from transformers import PreTrainedTokenizerBase @@ -50,12 +54,21 @@ except ImportError: from argparse import ArgumentParser as FlexibleArgumentParser -from benchmark_dataset import (AIMODataset, ASRDataset, BurstGPTDataset, - ConversationDataset, HuggingFaceDataset, - InstructCoderDataset, MTBenchDataset, - NextEditPredictionDataset, RandomDataset, - SampleRequest, ShareGPTDataset, SonnetDataset, - VisionArenaDataset) +from benchmark_dataset import ( + AIMODataset, + ASRDataset, + BurstGPTDataset, + ConversationDataset, + HuggingFaceDataset, + InstructCoderDataset, + MTBenchDataset, + NextEditPredictionDataset, + RandomDataset, + SampleRequest, + ShareGPTDataset, + SonnetDataset, + VisionArenaDataset, +) from benchmark_utils import convert_to_pytorch_benchmark_format, write_to_json MILLISECONDS_TO_SECONDS_CONVERSION = 1000 @@ -118,7 +131,8 @@ async def get_request( # Calculate scale parameter theta to maintain the desired request_rate. assert burstiness > 0, ( - f"A positive burstiness factor is expected, but given {burstiness}.") + f"A positive burstiness factor is expected, but given {burstiness}." + ) theta = 1.0 / (request_rate * burstiness) for request in input_requests: @@ -164,8 +178,10 @@ def calculate_metrics( # bundled together # Note : this may inflate the output token count slightly output_len = len( - tokenizer(outputs[i].generated_text, - add_special_tokens=False).input_ids) + tokenizer( + outputs[i].generated_text, add_special_tokens=False + ).input_ids + ) actual_output_lens.append(output_len) total_input += input_requests[i].prompt_len tpot = 0 @@ -188,16 +204,19 @@ def calculate_metrics( if "ttft" in goodput_config_dict: valid_metrics.append(ttfts) - slo_values.append(goodput_config_dict["ttft"] / - MILLISECONDS_TO_SECONDS_CONVERSION) + slo_values.append( + goodput_config_dict["ttft"] / MILLISECONDS_TO_SECONDS_CONVERSION + ) if "tpot" in goodput_config_dict: valid_metrics.append(all_tpots) - slo_values.append(goodput_config_dict["tpot"] / - MILLISECONDS_TO_SECONDS_CONVERSION) + slo_values.append( + goodput_config_dict["tpot"] / MILLISECONDS_TO_SECONDS_CONVERSION + ) if "e2el" in goodput_config_dict: valid_metrics.append(e2els) - slo_values.append(goodput_config_dict["e2el"] / - MILLISECONDS_TO_SECONDS_CONVERSION) + slo_values.append( + goodput_config_dict["e2el"] / MILLISECONDS_TO_SECONDS_CONVERSION + ) for req_metric in zip(*valid_metrics): is_good_req = all([s >= r for s, r in zip(slo_values, req_metric)]) @@ -208,7 +227,8 @@ def calculate_metrics( warnings.warn( "All requests failed. 
This is likely due to a misconfiguration " "on the benchmark arguments.", - stacklevel=2) + stacklevel=2, + ) metrics = BenchmarkMetrics( completed=completed, total_input=total_input, @@ -217,27 +237,31 @@ def calculate_metrics( request_goodput=good_completed / dur_s, output_throughput=sum(actual_output_lens) / dur_s, total_token_throughput=(total_input + sum(actual_output_lens)) / dur_s, - mean_ttft_ms=np.mean(ttfts or 0) * - 1000, # ttfts is empty if streaming is not supported by backend + mean_ttft_ms=np.mean(ttfts or 0) + * 1000, # ttfts is empty if streaming is not supported by backend std_ttft_ms=np.std(ttfts or 0) * 1000, median_ttft_ms=np.median(ttfts or 0) * 1000, - percentiles_ttft_ms=[(p, np.percentile(ttfts or 0, p) * 1000) - for p in selected_percentiles], + percentiles_ttft_ms=[ + (p, np.percentile(ttfts or 0, p) * 1000) for p in selected_percentiles + ], mean_tpot_ms=np.mean(tpots or 0) * 1000, std_tpot_ms=np.std(tpots or 0) * 1000, median_tpot_ms=np.median(tpots or 0) * 1000, - percentiles_tpot_ms=[(p, np.percentile(tpots or 0, p) * 1000) - for p in selected_percentiles], + percentiles_tpot_ms=[ + (p, np.percentile(tpots or 0, p) * 1000) for p in selected_percentiles + ], mean_itl_ms=np.mean(itls or 0) * 1000, std_itl_ms=np.std(itls or 0) * 1000, median_itl_ms=np.median(itls or 0) * 1000, - percentiles_itl_ms=[(p, np.percentile(itls or 0, p) * 1000) - for p in selected_percentiles], + percentiles_itl_ms=[ + (p, np.percentile(itls or 0, p) * 1000) for p in selected_percentiles + ], mean_e2el_ms=np.mean(e2els or 0) * 1000, std_e2el_ms=np.std(e2els or 0) * 1000, median_e2el_ms=np.median(e2els or 0) * 1000, - percentiles_e2el_ms=[(p, np.percentile(e2els or 0, p) * 1000) - for p in selected_percentiles], + percentiles_e2el_ms=[ + (p, np.percentile(e2els or 0, p) * 1000) for p in selected_percentiles + ], ) return metrics, actual_output_lens @@ -270,10 +294,12 @@ async def benchmark( raise ValueError(f"Unknown backend: {backend}") print("Starting initial single prompt test run...") - test_prompt, test_prompt_len, test_output_len, test_mm_content = \ - input_requests[0].prompt, input_requests[0].prompt_len, \ - input_requests[0].expected_output_len, \ - input_requests[0].multi_modal_data + test_prompt, test_prompt_len, test_output_len, test_mm_content = ( + input_requests[0].prompt, + input_requests[0].prompt_len, + input_requests[0].expected_output_len, + input_requests[0].multi_modal_data, + ) assert test_mm_content is None or isinstance(test_mm_content, dict) test_input = RequestFuncInput( @@ -293,36 +319,36 @@ async def benchmark( if not test_output.success: raise ValueError( "Initial test run failed - Please make sure benchmark arguments " - f"are correctly specified. Error: {test_output.error}") + f"are correctly specified. Error: {test_output.error}" + ) else: print("Initial test run completed. Starting main benchmark run...") if lora_modules: # For each input request, choose a LoRA module at random. 
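Aside on the goodput accounting in calculate_metrics, reformatted above (an illustrative sketch with invented numbers, not part of the patch): a request counts toward goodput only when every configured SLO is met, which is what the all(s >= r ...) comparison expresses.

    # Hypothetical per-request measurements in seconds: (ttft, e2el).
    measured = [(0.08, 1.9), (0.35, 2.4), (0.12, 2.1)]
    # SLOs arrive in milliseconds on the CLI; convert to seconds before comparing.
    slos = [200 / 1000, 2500 / 1000]  # ttft, e2el
    good = sum(
        all(slo >= value for slo, value in zip(slos, request))
        for request in measured
    )
    print(f"{good} of {len(measured)} requests count toward goodput")  # 2 of 3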
lora_modules = iter( - [random.choice(lora_modules) \ - for _ in range(len(input_requests))]) + [random.choice(lora_modules) for _ in range(len(input_requests))] + ) if profile: print("Starting profiler...") - profile_input = RequestFuncInput(model=model_id, - model_name=model_name, - prompt=test_prompt, - api_url=base_url + "/start_profile", - prompt_len=test_prompt_len, - output_len=test_output_len, - logprobs=logprobs, - multi_modal_content=test_mm_content, - ignore_eos=ignore_eos, - extra_body=extra_body) + profile_input = RequestFuncInput( + model=model_id, + model_name=model_name, + prompt=test_prompt, + api_url=base_url + "/start_profile", + prompt_len=test_prompt_len, + output_len=test_output_len, + logprobs=logprobs, + multi_modal_content=test_mm_content, + ignore_eos=ignore_eos, + extra_body=extra_body, + ) profile_output = await request_func(request_func_input=profile_input) if profile_output.success: print("Profiler started") - if burstiness == 1.0: - distribution = "Poisson process" - else: - distribution = "Gamma distribution" + distribution = "Poisson process" if burstiness == 1.0 else "Gamma distribution" print(f"Traffic request rate: {request_rate}") print(f"Burstiness factor: {burstiness} ({distribution})") @@ -334,42 +360,45 @@ async def benchmark( # and it will simplify the code in limited_request_func. # semaphore = (asyncio.Semaphore(max_concurrency) # if max_concurrency else contextlib.nullcontext()) - semaphore = (asyncio.Semaphore(max_concurrency) - if max_concurrency else None) + semaphore = asyncio.Semaphore(max_concurrency) if max_concurrency else None async def limited_request_func(request_func_input, pbar): if semaphore is None: - return await request_func(request_func_input=request_func_input, - pbar=pbar) + return await request_func(request_func_input=request_func_input, pbar=pbar) async with semaphore: - return await request_func(request_func_input=request_func_input, - pbar=pbar) + return await request_func(request_func_input=request_func_input, pbar=pbar) benchmark_start_time = time.perf_counter() tasks: list[asyncio.Task] = [] async for request in get_request(input_requests, request_rate, burstiness): - prompt, prompt_len, output_len, mm_content = request.prompt, \ - request.prompt_len, request.expected_output_len, \ - request.multi_modal_data + prompt, prompt_len, output_len, mm_content = ( + request.prompt, + request.prompt_len, + request.expected_output_len, + request.multi_modal_data, + ) req_model_id, req_model_name = model_id, model_name if lora_modules: req_lora_module = next(lora_modules) req_model_id, req_model_name = req_lora_module, req_lora_module - request_func_input = RequestFuncInput(model=req_model_id, - model_name=req_model_name, - prompt=prompt, - api_url=api_url, - prompt_len=prompt_len, - output_len=output_len, - logprobs=logprobs, - multi_modal_content=mm_content, - ignore_eos=ignore_eos, - extra_body=extra_body) + request_func_input = RequestFuncInput( + model=req_model_id, + model_name=req_model_name, + prompt=prompt, + api_url=api_url, + prompt_len=prompt_len, + output_len=output_len, + logprobs=logprobs, + multi_modal_content=mm_content, + ignore_eos=ignore_eos, + extra_body=extra_body, + ) tasks.append( asyncio.create_task( - limited_request_func(request_func_input=request_func_input, - pbar=pbar))) + limited_request_func(request_func_input=request_func_input, pbar=pbar) + ) + ) outputs: list[RequestFuncOutput] = await asyncio.gather(*tasks) if profile: @@ -401,22 +430,32 @@ async def limited_request_func(request_func_input, 
pbar): goodput_config_dict=goodput_config_dict, ) - print("{s:{c}^{n}}".format(s=' Serving Benchmark Result ', n=50, c='=')) + print("{s:{c}^{n}}".format(s=" Serving Benchmark Result ", n=50, c="=")) print("{:<40} {:<10}".format("Successful requests:", metrics.completed)) - print("{:<40} {:<10.2f}".format("Benchmark duration (s):", - benchmark_duration)) + print("{:<40} {:<10.2f}".format("Benchmark duration (s):", benchmark_duration)) print("{:<40} {:<10}".format("Total input tokens:", metrics.total_input)) - print("{:<40} {:<10}".format("Total generated tokens:", - metrics.total_output)) - print("{:<40} {:<10.2f}".format("Request throughput (req/s):", - metrics.request_throughput)) + print("{:<40} {:<10}".format("Total generated tokens:", metrics.total_output)) + print( + "{:<40} {:<10.2f}".format( + "Request throughput (req/s):", metrics.request_throughput + ) + ) if goodput_config_dict: - print("{:<40} {:<10.2f}".format("Request goodput (req/s):", - metrics.request_goodput)) - print("{:<40} {:<10.2f}".format("Output token throughput (tok/s):", - metrics.output_throughput)) - print("{:<40} {:<10.2f}".format("Total Token throughput (tok/s):", - metrics.total_token_throughput)) + print( + "{:<40} {:<10.2f}".format( + "Request goodput (req/s):", metrics.request_goodput + ) + ) + print( + "{:<40} {:<10.2f}".format( + "Output token throughput (tok/s):", metrics.output_throughput + ) + ) + print( + "{:<40} {:<10.2f}".format( + "Total Token throughput (tok/s):", metrics.total_token_throughput + ) + ) result = { "duration": benchmark_duration, @@ -424,8 +463,7 @@ async def limited_request_func(request_func_input, pbar): "total_input_tokens": metrics.total_input, "total_output_tokens": metrics.total_output, "request_throughput": metrics.request_throughput, - "request_goodput:": - metrics.request_goodput if goodput_config_dict else None, + "request_goodput:": metrics.request_goodput if goodput_config_dict else None, "output_throughput": metrics.output_throughput, "total_token_throughput": metrics.total_token_throughput, "input_lens": [output.prompt_len for output in outputs], @@ -448,29 +486,35 @@ def process_one_metric( # metric. 
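Aside on the percentile rows that process_one_metric prints below (an illustrative sketch, not part of the patch): each row is a NumPy percentile over the per-request latencies, scaled from seconds to milliseconds; the sample values here are invented.

    import numpy as np

    ttfts = [0.081, 0.094, 0.120, 0.350]  # per-request TTFTs in seconds
    for p in (50, 90, 99):
        # Matches the "P{p} TTFT (ms)" rows in the benchmark output.
        print(f"P{p} TTFT (ms): {np.percentile(ttfts, p) * 1000:.2f}")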
if metric_attribute_name not in selected_percentile_metrics: return - print("{s:{c}^{n}}".format(s=metric_header, n=50, c='-')) - print("{:<40} {:<10.2f}".format( - f"Mean {metric_name} (ms):", - getattr(metrics, f"mean_{metric_attribute_name}_ms"))) - print("{:<40} {:<10.2f}".format( - f"Median {metric_name} (ms):", - getattr(metrics, f"median_{metric_attribute_name}_ms"))) + print("{s:{c}^{n}}".format(s=metric_header, n=50, c="-")) + print( + "{:<40} {:<10.2f}".format( + f"Mean {metric_name} (ms):", + getattr(metrics, f"mean_{metric_attribute_name}_ms"), + ) + ) + print( + "{:<40} {:<10.2f}".format( + f"Median {metric_name} (ms):", + getattr(metrics, f"median_{metric_attribute_name}_ms"), + ) + ) result[f"mean_{metric_attribute_name}_ms"] = getattr( - metrics, f"mean_{metric_attribute_name}_ms") + metrics, f"mean_{metric_attribute_name}_ms" + ) result[f"median_{metric_attribute_name}_ms"] = getattr( - metrics, f"median_{metric_attribute_name}_ms") + metrics, f"median_{metric_attribute_name}_ms" + ) result[f"std_{metric_attribute_name}_ms"] = getattr( - metrics, f"std_{metric_attribute_name}_ms") - for p, value in getattr(metrics, - f"percentiles_{metric_attribute_name}_ms"): + metrics, f"std_{metric_attribute_name}_ms" + ) + for p, value in getattr(metrics, f"percentiles_{metric_attribute_name}_ms"): p_word = str(int(p)) if int(p) == p else str(p) - print("{:<40} {:<10.2f}".format(f"P{p_word} {metric_name} (ms):", - value)) + print("{:<40} {:<10.2f}".format(f"P{p_word} {metric_name} (ms):", value)) result[f"p{p_word}_{metric_attribute_name}_ms"] = value process_one_metric("ttft", "TTFT", "Time to First Token") - process_one_metric("tpot", "TPOT", - "Time per Output Token (excl. 1st token)") + process_one_metric("tpot", "TPOT", "Time per Output Token (excl. 1st token)") process_one_metric("itl", "ITL", "Inter-token Latency") process_one_metric("e2el", "E2EL", "End-to-end Latency") @@ -490,12 +534,14 @@ def check_goodput_args(args): raise ValueError( f"Invalid metric name found, {slo_name}: {slo_val}. " "The service level objective name should be one of " - f"{str(VALID_NAMES)}. ") + f"{str(VALID_NAMES)}. " + ) if slo_val < 0: raise ValueError( f"Invalid value found, {slo_name}: {slo_val}. " "The service level objective value should be " - "non-negative.") + "non-negative." + ) return goodput_config_dict @@ -508,31 +554,42 @@ def parse_goodput(slo_pairs): except ValueError as err: raise argparse.ArgumentTypeError( "Invalid format found for service level objectives. " - "Specify service level objectives for goodput as \"KEY:VALUE\" " + 'Specify service level objectives for goodput as "KEY:VALUE" ' "pairs, where the key is a metric name, and the value is a " - "number in milliseconds.") from err + "number in milliseconds." + ) from err return goodput_config_dict -def save_to_pytorch_benchmark_format(args: argparse.Namespace, - results: dict[str, Any], - file_name: str) -> None: +def save_to_pytorch_benchmark_format( + args: argparse.Namespace, results: dict[str, Any], file_name: str +) -> None: metrics = [ - "median_ttft_ms", "mean_ttft_ms", "std_ttft_ms", "p99_ttft_ms", - "mean_tpot_ms", "median_tpot_ms", "std_tpot_ms", "p99_tpot_ms", - "median_itl_ms", "mean_itl_ms", "std_itl_ms", "p99_itl_ms" + "median_ttft_ms", + "mean_ttft_ms", + "std_ttft_ms", + "p99_ttft_ms", + "mean_tpot_ms", + "median_tpot_ms", + "std_tpot_ms", + "p99_tpot_ms", + "median_itl_ms", + "mean_itl_ms", + "std_itl_ms", + "p99_itl_ms", ] # These raw data might be useful, but they are rather big. 
They can be added # later if needed ignored_metrics = ["ttfts", "itls", "generated_texts", "errors"] pt_records = convert_to_pytorch_benchmark_format( args=args, - metrics={k: [results[k]] - for k in metrics}, + metrics={k: [results[k]] for k in metrics}, extra_info={ k: results[k] - for k in results if k not in metrics and k not in ignored_metrics - }) + for k in results + if k not in metrics and k not in ignored_metrics + }, + ) if pt_records: # Don't use json suffix here as we don't want CI to pick it up pt_file = f"{os.path.splitext(file_name)[0]}.pytorch.json" @@ -557,34 +614,42 @@ def main(args: argparse.Namespace): api_url = f"http://{args.host}:{args.port}{args.endpoint}" base_url = f"http://{args.host}:{args.port}" - tokenizer = get_tokenizer(tokenizer_id, - tokenizer_mode=tokenizer_mode, - trust_remote_code=args.trust_remote_code) + tokenizer = get_tokenizer( + tokenizer_id, + tokenizer_mode=tokenizer_mode, + trust_remote_code=args.trust_remote_code, + ) if args.dataset_name is None: raise ValueError( "Please specify '--dataset-name' and the corresponding " - "'--dataset-path' if required.") + "'--dataset-path' if required." + ) if args.dataset_name == "sonnet": dataset = SonnetDataset(dataset_path=args.dataset_path) # For the "sonnet" dataset, formatting depends on the backend. if args.backend == "openai-chat": - input_requests = dataset.sample(num_requests=args.num_prompts, - input_len=args.sonnet_input_len, - output_len=args.sonnet_output_len, - prefix_len=args.sonnet_prefix_len, - tokenizer=tokenizer, - return_prompt_formatted=False) + input_requests = dataset.sample( + num_requests=args.num_prompts, + input_len=args.sonnet_input_len, + output_len=args.sonnet_output_len, + prefix_len=args.sonnet_prefix_len, + tokenizer=tokenizer, + return_prompt_formatted=False, + ) else: assert tokenizer.chat_template or tokenizer.default_chat_template, ( - "Tokenizer/model must have chat template for sonnet dataset.") - input_requests = dataset.sample(num_requests=args.num_prompts, - input_len=args.sonnet_input_len, - output_len=args.sonnet_output_len, - prefix_len=args.sonnet_prefix_len, - tokenizer=tokenizer, - return_prompt_formatted=True) + "Tokenizer/model must have chat template for sonnet dataset." + ) + input_requests = dataset.sample( + num_requests=args.num_prompts, + input_len=args.sonnet_input_len, + output_len=args.sonnet_output_len, + prefix_len=args.sonnet_prefix_len, + tokenizer=tokenizer, + return_prompt_formatted=True, + ) elif args.dataset_name == "hf": # all following datasets are implemented from the @@ -611,23 +676,30 @@ def main(args: argparse.Namespace): dataset_class = ASRDataset args.hf_split = "train" else: - supported_datasets = set([ - dataset_name for cls in HuggingFaceDataset.__subclasses__() - for dataset_name in cls.SUPPORTED_DATASET_PATHS - ]) + supported_datasets = set( + [ + dataset_name + for cls in HuggingFaceDataset.__subclasses__() + for dataset_name in cls.SUPPORTED_DATASET_PATHS + ] + ) raise ValueError( f"Unsupported dataset path: {args.dataset_path}. " "Huggingface dataset only supports dataset_path" f" from one of following: {supported_datasets}. " "Please consider contributing if you would " - "like to add support for additional dataset formats.") + "like to add support for additional dataset formats." 
+ ) - if (dataset_class.IS_MULTIMODAL and backend not in \ - ["openai-chat", "openai-audio"]): + if dataset_class.IS_MULTIMODAL and backend not in [ + "openai-chat", + "openai-audio", + ]: # multi-modal benchmark is only available on OpenAI Chat backend. raise ValueError( - "Multi-modal content is only supported on 'openai-chat' and " \ - "'openai-audio' backend.") + "Multi-modal content is only supported on 'openai-chat' and " + "'openai-audio' backend." + ) input_requests = dataset_class( dataset_path=args.dataset_path, dataset_subset=args.hf_subset, @@ -642,26 +714,24 @@ def main(args: argparse.Namespace): else: # For datasets that follow a similar structure, use a mapping. dataset_mapping = { - "sharegpt": - lambda: ShareGPTDataset(random_seed=args.seed, - dataset_path=args.dataset_path).sample( - tokenizer=tokenizer, - num_requests=args.num_prompts, - output_len=args.sharegpt_output_len, - ), - "burstgpt": - lambda: BurstGPTDataset(random_seed=args.seed, - dataset_path=args.dataset_path). - sample(tokenizer=tokenizer, num_requests=args.num_prompts), - "random": - lambda: RandomDataset(dataset_path=args.dataset_path).sample( + "sharegpt": lambda: ShareGPTDataset( + random_seed=args.seed, dataset_path=args.dataset_path + ).sample( + tokenizer=tokenizer, + num_requests=args.num_prompts, + output_len=args.sharegpt_output_len, + ), + "burstgpt": lambda: BurstGPTDataset( + random_seed=args.seed, dataset_path=args.dataset_path + ).sample(tokenizer=tokenizer, num_requests=args.num_prompts), + "random": lambda: RandomDataset(dataset_path=args.dataset_path).sample( tokenizer=tokenizer, num_requests=args.num_prompts, prefix_len=args.random_prefix_len, input_len=args.random_input_len, output_len=args.random_output_len, range_ratio=args.random_range_ratio, - ) + ), } try: @@ -677,15 +747,16 @@ def main(args: argparse.Namespace): "top_p": args.top_p, "top_k": args.top_k, "min_p": args.min_p, - "temperature": args.temperature - }.items() if v is not None + "temperature": args.temperature, + }.items() + if v is not None } # Sampling parameters are only supported by openai-compatible backend. if sampling_params and args.backend not in OPENAI_COMPATIBLE_BACKENDS: raise ValueError( - "Sampling parameters are only supported by openai-compatible " - "backends.") + "Sampling parameters are only supported by openai-compatible backends." + ) if "temperature" not in sampling_params: sampling_params["temperature"] = 0.0 # Default to greedy decoding. @@ -709,15 +780,14 @@ def main(args: argparse.Namespace): disable_tqdm=args.disable_tqdm, profile=args.profile, selected_percentile_metrics=args.percentile_metrics.split(","), - selected_percentiles=[ - float(p) for p in args.metric_percentiles.split(",") - ], + selected_percentiles=[float(p) for p in args.metric_percentiles.split(",")], ignore_eos=args.ignore_eos, goodput_config_dict=goodput_config_dict, max_concurrency=args.max_concurrency, lora_modules=args.lora_modules, extra_body=sampling_params, - )) + ) + ) # Save config and results to json if args.save_result or args.append_result: @@ -742,8 +812,9 @@ def main(args: argparse.Namespace): "Invalid metadata format. Please use KEY=VALUE format." 
) # Traffic - result_json["request_rate"] = (args.request_rate if args.request_rate - < float("inf") else "inf") + result_json["request_rate"] = ( + args.request_rate if args.request_rate < float("inf") else "inf" + ) result_json["burstiness"] = args.burstiness result_json["max_concurrency"] = args.max_concurrency @@ -753,24 +824,31 @@ def main(args: argparse.Namespace): if not args.save_detailed: # Remove fields with too many data points for field in [ - "input_lens", "output_lens", "ttfts", "itls", - "generated_texts", "errors" + "input_lens", + "output_lens", + "ttfts", + "itls", + "generated_texts", + "errors", ]: if field in result_json: del result_json[field] # Save to file base_model_id = model_id.split("/")[-1] - max_concurrency_str = (f"-concurrency{args.max_concurrency}" - if args.max_concurrency is not None else "") - file_name = f"{backend}-{args.request_rate}qps{max_concurrency_str}-{base_model_id}-{current_dt}.json" #noqa + max_concurrency_str = ( + f"-concurrency{args.max_concurrency}" + if args.max_concurrency is not None + else "" + ) + file_name = f"{backend}-{args.request_rate}qps{max_concurrency_str}-{base_model_id}-{current_dt}.json" # noqa if args.result_filename: file_name = args.result_filename if args.result_dir: file_name = os.path.join(args.result_dir, file_name) - with open(file_name, - mode="a+" if args.append_result else "w", - encoding='utf-8') as outfile: + with open( + file_name, mode="a+" if args.append_result else "w", encoding="utf-8" + ) as outfile: # Append a newline. if args.append_result and outfile.tell() != 0: outfile.write("\n") @@ -780,7 +858,8 @@ def main(args: argparse.Namespace): if __name__ == "__main__": parser = FlexibleArgumentParser( - description="Benchmark the online serving throughput.") + description="Benchmark the online serving throughput." + ) parser.add_argument( "--backend", type=str, @@ -809,11 +888,13 @@ def main(args: argparse.Namespace): choices=["sharegpt", "burstgpt", "sonnet", "random", "hf"], help="Name of the dataset to benchmark on.", ) - parser.add_argument("--dataset-path", - type=str, - default=None, - help="Path to the sharegpt/sonnet dataset. " - "Or the huggingface dataset ID if using HF dataset.") + parser.add_argument( + "--dataset-path", + type=str, + default=None, + help="Path to the sharegpt/sonnet dataset. " + "Or the huggingface dataset ID if using HF dataset.", + ) parser.add_argument( "--max-concurrency", type=int, @@ -825,7 +906,8 @@ def main(args: argparse.Namespace): "initiated, this argument will control how many are actually allowed " "to execute at a time. This means that when used in combination, the " "actual request rate may be lower than specified with --request-rate, " - "if the server is not processing requests fast enough to keep up.") + "if the server is not processing requests fast enough to keep up.", + ) parser.add_argument( "--model", @@ -836,8 +918,7 @@ def main(args: argparse.Namespace): parser.add_argument( "--tokenizer", type=str, - help= - "Name or path of the tokenizer, if not using the default tokenizer.", # noqa: E501 + help="Name or path of the tokenizer, if not using the default tokenizer.", # noqa: E501 ) parser.add_argument("--use-beam-search", action="store_true") parser.add_argument( @@ -850,11 +931,13 @@ def main(args: argparse.Namespace): "--logprobs", type=int, default=None, - help=("Number of logprobs-per-token to compute & return as part of " - "the request. 
If unspecified, then either (1) if beam search " - "is disabled, no logprobs are computed & a single dummy " - "logprob is returned for each token; or (2) if beam search " - "is enabled 1 logprob per token is computed"), + help=( + "Number of logprobs-per-token to compute & return as part of " + "the request. If unspecified, then either (1) if beam search " + "is disabled, no logprobs are computed & a single dummy " + "logprob is returned for each token; or (2) if beam search " + "is enabled 1 logprob per token is computed" + ), ) parser.add_argument( "--request-rate", @@ -938,35 +1021,38 @@ def main(args: argparse.Namespace): "--ignore-eos", action="store_true", help="Set ignore_eos flag when sending the benchmark request." - "Warning: ignore_eos is not supported in deepspeed_mii and tgi.") + "Warning: ignore_eos is not supported in deepspeed_mii and tgi.", + ) parser.add_argument( "--percentile-metrics", type=str, default="ttft,tpot,itl", help="Comma-separated list of selected metrics to report percentils. " "This argument specifies the metrics to report percentiles. " - "Allowed metric names are \"ttft\", \"tpot\", \"itl\", \"e2el\". " - "Default value is \"ttft,tpot,itl\".") + 'Allowed metric names are "ttft", "tpot", "itl", "e2el". ' + 'Default value is "ttft,tpot,itl".', + ) parser.add_argument( "--metric-percentiles", type=str, default="99", help="Comma-separated list of percentiles for selected metrics. " - "To report 25-th, 50-th, and 75-th percentiles, use \"25,50,75\". " - "Default value is \"99\". " - "Use \"--percentile-metrics\" to select metrics.", + 'To report 25-th, 50-th, and 75-th percentiles, use "25,50,75". ' + 'Default value is "99". ' + 'Use "--percentile-metrics" to select metrics.', ) parser.add_argument( "--goodput", nargs="+", required=False, - help="Specify service level objectives for goodput as \"KEY:VALUE\" " + help='Specify service level objectives for goodput as "KEY:VALUE" ' "pairs, where the key is a metric name, and the value is in " - "milliseconds. Multiple \"KEY:VALUE\" pairs can be provided, " + 'milliseconds. Multiple "KEY:VALUE" pairs can be provided, ' "separated by spaces. Allowed request level metric names are " - "\"ttft\", \"tpot\", \"e2el\". For more context on the definition of " + '"ttft", "tpot", "e2el". 
For more context on the definition of ' "goodput, refer to DistServe paper: https://arxiv.org/pdf/2401.09670 " - "and the blog: https://hao-ai-lab.github.io/blogs/distserve") + "and the blog: https://hao-ai-lab.github.io/blogs/distserve", + ) # group for dataset specific arguments sonnet_group = parser.add_argument_group("sonnet dataset options") @@ -974,22 +1060,19 @@ def main(args: argparse.Namespace): "--sonnet-input-len", type=int, default=550, - help= - "Number of input tokens per request, used only for sonnet dataset.", + help="Number of input tokens per request, used only for sonnet dataset.", ) sonnet_group.add_argument( "--sonnet-output-len", type=int, default=150, - help= - "Number of output tokens per request, used only for sonnet dataset.", + help="Number of output tokens per request, used only for sonnet dataset.", ) sonnet_group.add_argument( "--sonnet-prefix-len", type=int, default=200, - help= - "Number of prefix tokens per request, used only for sonnet dataset.", + help="Number of prefix tokens per request, used only for sonnet dataset.", ) sharegpt_group = parser.add_argument_group("sharegpt dataset options") @@ -998,22 +1081,21 @@ def main(args: argparse.Namespace): type=int, default=None, help="Output length for each request. Overrides the output length " - "from the ShareGPT dataset.") + "from the ShareGPT dataset.", + ) random_group = parser.add_argument_group("random dataset options") random_group.add_argument( "--random-input-len", type=int, default=1024, - help= - "Number of input tokens per request, used only for random sampling.", + help="Number of input tokens per request, used only for random sampling.", ) random_group.add_argument( "--random-output-len", type=int, default=128, - help= - "Number of output tokens per request, used only for random sampling.", + help="Number of output tokens per request, used only for random sampling.", ) random_group.add_argument( "--random-range-ratio", @@ -1028,23 +1110,23 @@ def main(args: argparse.Namespace): "--random-prefix-len", type=int, default=0, - help=("Number of fixed prefix tokens before the random context " - "in a request. " - "The total input length is the sum of `random-prefix-len` and " - "a random " - "context length sampled from [input_len * (1 - range_ratio), " - "input_len * (1 + range_ratio)]."), + help=( + "Number of fixed prefix tokens before the random context " + "in a request. " + "The total input length is the sum of `random-prefix-len` and " + "a random " + "context length sampled from [input_len * (1 - range_ratio), " + "input_len * (1 + range_ratio)]." + ), ) hf_group = parser.add_argument_group("hf dataset options") - hf_group.add_argument("--hf-subset", - type=str, - default=None, - help="Subset of the HF dataset.") - hf_group.add_argument("--hf-split", - type=str, - default=None, - help="Split of the HF dataset.") + hf_group.add_argument( + "--hf-subset", type=str, default=None, help="Subset of the HF dataset." + ) + hf_group.add_argument( + "--hf-split", type=str, default=None, help="Split of the HF dataset." + ) hf_group.add_argument( "--hf-output-len", type=int, @@ -1058,52 +1140,58 @@ def main(args: argparse.Namespace): "--top-p", type=float, default=None, - help="Top-p sampling parameter. Only has effect on openai-compatible " - "backends.") + help="Top-p sampling parameter. Only has effect on openai-compatible backends.", + ) sampling_group.add_argument( "--top-k", type=int, default=None, - help="Top-k sampling parameter. 
Only has effect on openai-compatible " - "backends.") + help="Top-k sampling parameter. Only has effect on openai-compatible backends.", + ) sampling_group.add_argument( "--min-p", type=float, default=None, - help="Min-p sampling parameter. Only has effect on openai-compatible " - "backends.") + help="Min-p sampling parameter. Only has effect on openai-compatible backends.", + ) sampling_group.add_argument( "--temperature", type=float, default=None, help="Temperature sampling parameter. Only has effect on " "openai-compatible backends. If not specified, default to greedy " - "decoding (i.e. temperature==0.0).") + "decoding (i.e. temperature==0.0).", + ) parser.add_argument( - '--tokenizer-mode', + "--tokenizer-mode", type=str, default="auto", - choices=['auto', 'slow', 'mistral', 'custom'], + choices=["auto", "slow", "mistral", "custom"], help='The tokenizer mode.\n\n* "auto" will use the ' 'fast tokenizer if available.\n* "slow" will ' - 'always use the slow tokenizer. \n* ' + "always use the slow tokenizer. \n* " '"mistral" will always use the `mistral_common` tokenizer. \n*' - '"custom" will use --tokenizer to select the preregistered tokenizer.') - - parser.add_argument("--served-model-name", - type=str, - default=None, - help="The model name used in the API. " - "If not specified, the model name will be the " - "same as the ``--model`` argument. ") - - parser.add_argument("--lora-modules", - nargs='+', - default=None, - help="A subset of LoRA module names passed in when " - "launching the server. For each request, the " - "script chooses a LoRA module at random.") + '"custom" will use --tokenizer to select the preregistered tokenizer.', + ) + + parser.add_argument( + "--served-model-name", + type=str, + default=None, + help="The model name used in the API. " + "If not specified, the model name will be the " + "same as the ``--model`` argument. ", + ) + + parser.add_argument( + "--lora-modules", + nargs="+", + default=None, + help="A subset of LoRA module names passed in when " + "launching the server. For each request, the " + "script chooses a LoRA module at random.", + ) args = parser.parse_args() diff --git a/benchmarks/benchmark_serving_structured_output.py b/benchmarks/benchmark_serving_structured_output.py index 9084255d2440..5432e0d60787 100644 --- a/benchmarks/benchmark_serving_structured_output.py +++ b/benchmarks/benchmark_serving_structured_output.py @@ -19,6 +19,7 @@ --endpoint /generate_stream to the end of the command above. """ + import argparse import asyncio import copy @@ -36,8 +37,11 @@ import datasets import numpy as np import pandas as pd -from backend_request_func import (ASYNC_REQUEST_FUNCS, RequestFuncInput, - RequestFuncOutput) +from backend_request_func import ( + ASYNC_REQUEST_FUNCS, + RequestFuncInput, + RequestFuncOutput, +) from tqdm.asyncio import tqdm from transformers import PreTrainedTokenizerBase @@ -52,7 +56,8 @@ from argparse import ArgumentParser as FlexibleArgumentParser from vllm.v1.structured_output.backend_xgrammar import ( - has_xgrammar_unsupported_json_features) + has_xgrammar_unsupported_json_features, +) MILLISECONDS_TO_SECONDS_CONVERSION = 1000 @@ -98,6 +103,7 @@ class SampleRequest: prompt_len: The length of the prompt in tokens. expected_output_len: The expected length of the output in tokens. 
""" + prompt: str prompt_len: int expected_output_len: int @@ -106,32 +112,28 @@ class SampleRequest: completion: str = None -def sample_requests(tokenizer: PreTrainedTokenizerBase, - args: argparse.Namespace) -> list[SampleRequest]: - if args.dataset == 'json' or args.dataset == 'json-unique': +def sample_requests( + tokenizer: PreTrainedTokenizerBase, args: argparse.Namespace +) -> list[SampleRequest]: + if args.dataset == "json" or args.dataset == "json-unique": if args.json_schema_path is None: dir_path = os.path.dirname(os.path.realpath(__file__)) - args.json_schema_path = os.path.join(dir_path, - "structured_schemas", - "structured_schema_1.json") + args.json_schema_path = os.path.join( + dir_path, "structured_schemas", "structured_schema_1.json" + ) json_schemas = [] with open(args.json_schema_path) as f: schema = json.load(f) - if args.dataset == 'json-unique': - json_schemas = [ - copy.deepcopy(schema) for _ in range(args.num_prompts) - ] + if args.dataset == "json-unique": + json_schemas = [copy.deepcopy(schema) for _ in range(args.num_prompts)] for i in range(len(json_schemas)): if "properties" not in json_schemas[i]: json_schemas[i]["properties"] = {} - json_schemas[i]["properties"][ - f"__optional_field_{uuid.uuid4()}"] = { - "type": - "string", - "description": - "An unique optional field to avoid cached schemas" - } + json_schemas[i]["properties"][f"__optional_field_{uuid.uuid4()}"] = { + "type": "string", + "description": "An unique optional field to avoid cached schemas", + } else: json_schemas = [schema] * args.num_prompts @@ -142,11 +144,13 @@ def get_schema(index: int): return json_schemas[index % len(json_schemas)] requests = [ - SampleRequest(prompt=gen_prompt(i), - prompt_len=len(tokenizer(gen_prompt(i)).input_ids), - expected_output_len=args.output_len, - schema=get_schema(i), - structure_type=args.structure_type) + SampleRequest( + prompt=gen_prompt(i), + prompt_len=len(tokenizer(gen_prompt(i)).input_ids), + expected_output_len=args.output_len, + schema=get_schema(i), + structure_type=args.structure_type, + ) for i in range(args.num_prompts) ] @@ -170,11 +174,13 @@ def get_schema(index: int): input_len = len(tokenizer(prompt).input_ids) print(f"Input length of the prompt: {input_len} tokens") requests = [ - SampleRequest(prompt=prompt, - prompt_len=input_len, - expected_output_len=args.output_len, - schema=schema, - structure_type=args.structure_type) + SampleRequest( + prompt=prompt, + prompt_len=input_len, + expected_output_len=args.output_len, + schema=schema, + structure_type=args.structure_type, + ) for _ in range(args.num_prompts) ] @@ -188,11 +194,13 @@ def get_schema(index: int): input_len = len(tokenizer(prompt).input_ids) print(f"Input length of the prompt: {input_len} tokens") requests = [ - SampleRequest(prompt=prompt, - prompt_len=input_len, - expected_output_len=args.output_len, - schema=regex, - structure_type=args.structure_type) + SampleRequest( + prompt=prompt, + prompt_len=input_len, + expected_output_len=args.output_len, + schema=regex, + structure_type=args.structure_type, + ) for _ in range(args.num_prompts) ] @@ -203,48 +211,55 @@ def get_schema(index: int): input_len = len(tokenizer(prompt).input_ids) print(f"Input length of the prompt: {input_len} tokens") requests = [ - SampleRequest(prompt=prompt, - prompt_len=input_len, - expected_output_len=args.output_len, - schema=choice, - structure_type=args.structure_type) + SampleRequest( + prompt=prompt, + prompt_len=input_len, + expected_output_len=args.output_len, + schema=choice, + 
structure_type=args.structure_type, + ) for _ in range(args.num_prompts) ] elif args.dataset == "xgrammar_bench": requests: list[SampleRequest] = [] - dataset = datasets.load_dataset("NousResearch/json-mode-eval", - split="train") + dataset = datasets.load_dataset("NousResearch/json-mode-eval", split="train") full_dataset_len = len(dataset) def _filter_func(item): import json + schema = json.loads(item["schema"]) return not has_xgrammar_unsupported_json_features(schema) dataset = dataset.filter(_filter_func) num_filtered_out = full_dataset_len - len(dataset) - print(f"dataset has {len(dataset)} entries after filtering " - f"out {num_filtered_out} entries with unsupported features") + print( + f"dataset has {len(dataset)} entries after filtering " + f"out {num_filtered_out} entries with unsupported features" + ) len_dataset = len(dataset) for data_point_idx in range(args.num_prompts): idx = data_point_idx while idx >= len_dataset: idx -= len_dataset schema = dataset["schema"][idx] - prompt = tokenizer.apply_chat_template(dataset["prompt"][idx], - tokenize=False, - add_generation_prompt=True) + prompt = tokenizer.apply_chat_template( + dataset["prompt"][idx], tokenize=False, add_generation_prompt=True + ) input_len = len(tokenizer(prompt).input_ids) completion = dataset["completion"][idx] requests.append( - SampleRequest(prompt=prompt, - prompt_len=input_len, - expected_output_len=args.output_len, - schema=schema, - structure_type=args.structure_type, - completion=completion)) + SampleRequest( + prompt=prompt, + prompt_len=input_len, + expected_output_len=args.output_len, + schema=schema, + structure_type=args.structure_type, + completion=completion, + ) + ) return requests @@ -276,7 +291,8 @@ async def get_request( # Calculate scale parameter theta to maintain the desired request_rate. assert burstiness > 0, ( - f"A positive burstiness factor is expected, but given {burstiness}.") + f"A positive burstiness factor is expected, but given {burstiness}." + ) theta = 1.0 / (request_rate * burstiness) for i, request in enumerate(input_requests): @@ -318,8 +334,8 @@ def calculate_metrics( # multiple output tokens may be bundled together # Note : this may inflate the output token count slightly output_len = len( - tokenizer(outputs[i].generated_text, - add_special_tokens=False).input_ids) + tokenizer(outputs[i].generated_text, add_special_tokens=False).input_ids + ) actual_output_lens.append(output_len) total_input += input_requests[i].prompt_len tpot = 0 @@ -343,16 +359,19 @@ def calculate_metrics( if "ttft" in goodput_config_dict: valid_metrics.append(ttfts) - slo_values.append(goodput_config_dict["ttft"] / - MILLISECONDS_TO_SECONDS_CONVERSION) + slo_values.append( + goodput_config_dict["ttft"] / MILLISECONDS_TO_SECONDS_CONVERSION + ) if "tpot" in goodput_config_dict: valid_metrics.append(all_tpots) - slo_values.append(goodput_config_dict["tpot"] / - MILLISECONDS_TO_SECONDS_CONVERSION) + slo_values.append( + goodput_config_dict["tpot"] / MILLISECONDS_TO_SECONDS_CONVERSION + ) if "e2el" in goodput_config_dict: valid_metrics.append(e2els) - slo_values.append(goodput_config_dict["e2el"] / - MILLISECONDS_TO_SECONDS_CONVERSION) + slo_values.append( + goodput_config_dict["e2el"] / MILLISECONDS_TO_SECONDS_CONVERSION + ) for req_metric in zip(*valid_metrics): is_good_req = all([s >= r for s, r in zip(slo_values, req_metric)]) @@ -363,7 +382,8 @@ def calculate_metrics( warnings.warn( "All requests failed. 
This is likely due to a misconfiguration " "on the benchmark arguments.", - stacklevel=2) + stacklevel=2, + ) metrics = BenchmarkMetrics( completed=completed, total_input=total_input, @@ -372,27 +392,31 @@ def calculate_metrics( request_goodput=good_completed / dur_s, output_throughput=sum(actual_output_lens) / dur_s, total_token_throughput=(total_input + sum(actual_output_lens)) / dur_s, - mean_ttft_ms=np.mean(ttfts or 0) * - 1000, # ttfts is empty if streaming is not supported by backend + mean_ttft_ms=np.mean(ttfts or 0) + * 1000, # ttfts is empty if streaming is not supported by backend std_ttft_ms=np.std(ttfts or 0) * 1000, median_ttft_ms=np.median(ttfts or 0) * 1000, - percentiles_ttft_ms=[(p, np.percentile(ttfts or 0, p) * 1000) - for p in selected_percentiles], + percentiles_ttft_ms=[ + (p, np.percentile(ttfts or 0, p) * 1000) for p in selected_percentiles + ], mean_tpot_ms=np.mean(tpots or 0) * 1000, std_tpot_ms=np.std(tpots or 0) * 1000, median_tpot_ms=np.median(tpots or 0) * 1000, - percentiles_tpot_ms=[(p, np.percentile(tpots or 0, p) * 1000) - for p in selected_percentiles], + percentiles_tpot_ms=[ + (p, np.percentile(tpots or 0, p) * 1000) for p in selected_percentiles + ], mean_itl_ms=np.mean(itls or 0) * 1000, std_itl_ms=np.std(itls or 0) * 1000, median_itl_ms=np.median(itls or 0) * 1000, - percentiles_itl_ms=[(p, np.percentile(itls or 0, p) * 1000) - for p in selected_percentiles], + percentiles_itl_ms=[ + (p, np.percentile(itls or 0, p) * 1000) for p in selected_percentiles + ], mean_e2el_ms=np.mean(e2els or 0) * 1000, std_e2el_ms=np.std(e2els or 0) * 1000, median_e2el_ms=np.median(e2els or 0) * 1000, - percentiles_e2el_ms=[(p, np.percentile(e2els or 0, p) * 1000) - for p in selected_percentiles], + percentiles_e2el_ms=[ + (p, np.percentile(e2els or 0, p) * 1000) for p in selected_percentiles + ], ) return metrics, actual_output_lens @@ -429,12 +453,13 @@ def prepare_extra_body(request) -> dict: print("Starting initial single prompt test run...") structured_output_req_idx = random.sample( - range(len(input_requests)), - int(len(input_requests) * structured_output_ratio)) + range(len(input_requests)), int(len(input_requests) * structured_output_ratio) + ) test_request = input_requests[0] - test_req_extra_body = (prepare_extra_body(test_request) - if 0 in structured_output_req_idx else None) + test_req_extra_body = ( + prepare_extra_body(test_request) if 0 in structured_output_req_idx else None + ) test_input = RequestFuncInput( model=model_id, prompt=test_request.prompt, @@ -448,7 +473,8 @@ def prepare_extra_body(request) -> dict: if not test_output.success: raise ValueError( "Initial test run failed - Please make sure benchmark arguments " - f"are correctly specified. Error: {test_output.error}") + f"are correctly specified. Error: {test_output.error}" + ) else: print("Initial test run completed. Starting main benchmark run...") @@ -467,10 +493,7 @@ def prepare_extra_body(request) -> dict: if profile_output.success: print("Profiler started") - if burstiness == 1.0: - distribution = "Poisson process" - else: - distribution = "Gamma distribution" + distribution = "Poisson process" if burstiness == 1.0 else "Gamma distribution" print(f"Traffic request rate: {request_rate}") print(f"Burstiness factor: {burstiness} ({distribution})") @@ -482,24 +505,21 @@ def prepare_extra_body(request) -> dict: # and it will simplify the code in limited_request_func. 
# semaphore = (asyncio.Semaphore(max_concurrency) # if max_concurrency else contextlib.nullcontext()) - semaphore = (asyncio.Semaphore(max_concurrency) - if max_concurrency else None) + semaphore = asyncio.Semaphore(max_concurrency) if max_concurrency else None async def limited_request_func(request_func_input, pbar): if semaphore is None: - return await request_func(request_func_input=request_func_input, - pbar=pbar) + return await request_func(request_func_input=request_func_input, pbar=pbar) async with semaphore: - return await request_func(request_func_input=request_func_input, - pbar=pbar) + return await request_func(request_func_input=request_func_input, pbar=pbar) benchmark_start_time = time.perf_counter() tasks: list[asyncio.Task] = [] expected: list[str] = [] - async for i, request in get_request(input_requests, request_rate, - burstiness): - extra_body = prepare_extra_body( - request) if i in structured_output_req_idx else None + async for i, request in get_request(input_requests, request_rate, burstiness): + extra_body = ( + prepare_extra_body(request) if i in structured_output_req_idx else None + ) request_func_input = RequestFuncInput( model=model_id, prompt=request.prompt, @@ -512,8 +532,9 @@ async def limited_request_func(request_func_input, pbar): expected.append(request.completion) tasks.append( asyncio.create_task( - limited_request_func(request_func_input=request_func_input, - pbar=pbar))) + limited_request_func(request_func_input=request_func_input, pbar=pbar) + ) + ) outputs: list[RequestFuncOutput] = await asyncio.gather(*tasks) if profile: @@ -545,54 +566,58 @@ async def limited_request_func(request_func_input, pbar): goodput_config_dict=goodput_config_dict, ) - print("{s:{c}^{n}}".format(s=' Serving Benchmark Result ', n=50, c='=')) + print("{s:{c}^{n}}".format(s=" Serving Benchmark Result ", n=50, c="=")) print("{:<40} {:<10}".format("Successful requests:", metrics.completed)) - print("{:<40} {:<10.2f}".format("Benchmark duration (s):", - benchmark_duration)) + print("{:<40} {:<10.2f}".format("Benchmark duration (s):", benchmark_duration)) print("{:<40} {:<10}".format("Total input tokens:", metrics.total_input)) - print("{:<40} {:<10}".format("Total generated tokens:", - metrics.total_output)) - print("{:<40} {:<10.2f}".format("Request throughput (req/s):", - metrics.request_throughput)) + print("{:<40} {:<10}".format("Total generated tokens:", metrics.total_output)) + print( + "{:<40} {:<10.2f}".format( + "Request throughput (req/s):", metrics.request_throughput + ) + ) if goodput_config_dict: - print("{:<40} {:<10.2f}".format("Request goodput (req/s):", - metrics.request_goodput)) - print("{:<40} {:<10.2f}".format("Output token throughput (tok/s):", - metrics.output_throughput)) - print("{:<40} {:<10.2f}".format("Total Token throughput (tok/s):", - metrics.total_token_throughput)) + print( + "{:<40} {:<10.2f}".format( + "Request goodput (req/s):", metrics.request_goodput + ) + ) + print( + "{:<40} {:<10.2f}".format( + "Output token throughput (tok/s):", metrics.output_throughput + ) + ) + print( + "{:<40} {:<10.2f}".format( + "Total Token throughput (tok/s):", metrics.total_token_throughput + ) + ) result = { - "duration": - benchmark_duration, - "completed": - metrics.completed, - "total_input_tokens": - metrics.total_input, - "total_output_tokens": - metrics.total_output, - "request_throughput": - metrics.request_throughput, - "output_throughput": - metrics.output_throughput, - "total_token_throughput": - metrics.total_token_throughput, - "ttft_description": - 
pd.Series([output.ttft for output in outputs]).describe().to_dict(), - "tpot_description": - pd.Series([output.tpot for output in outputs]).describe().to_dict(), + "duration": benchmark_duration, + "completed": metrics.completed, + "total_input_tokens": metrics.total_input, + "total_output_tokens": metrics.total_output, + "request_throughput": metrics.request_throughput, + "output_throughput": metrics.output_throughput, + "total_token_throughput": metrics.total_token_throughput, + "ttft_description": pd.Series([output.ttft for output in outputs]) + .describe() + .to_dict(), + "tpot_description": pd.Series([output.tpot for output in outputs]) + .describe() + .to_dict(), "input_lens": [output.prompt_len for output in outputs], - "output_lens": - actual_output_lens, + "output_lens": actual_output_lens, "ttfts": [output.ttft for output in outputs], "itls": [output.itl for output in outputs], "errors": [output.error for output in outputs], } - ret = [{ - 'generated': output.generated_text, - 'expected': gt - } for output, gt in zip(outputs, expected)] + ret = [ + {"generated": output.generated_text, "expected": gt} + for output, gt in zip(outputs, expected) + ] def process_one_metric( # E.g., "ttft" @@ -606,29 +631,35 @@ def process_one_metric( # metric. if metric_attribute_name not in selected_percentile_metrics: return - print("{s:{c}^{n}}".format(s=metric_header, n=50, c='-')) - print("{:<40} {:<10.2f}".format( - f"Mean {metric_name} (ms):", - getattr(metrics, f"mean_{metric_attribute_name}_ms"))) - print("{:<40} {:<10.2f}".format( - f"Median {metric_name} (ms):", - getattr(metrics, f"median_{metric_attribute_name}_ms"))) + print("{s:{c}^{n}}".format(s=metric_header, n=50, c="-")) + print( + "{:<40} {:<10.2f}".format( + f"Mean {metric_name} (ms):", + getattr(metrics, f"mean_{metric_attribute_name}_ms"), + ) + ) + print( + "{:<40} {:<10.2f}".format( + f"Median {metric_name} (ms):", + getattr(metrics, f"median_{metric_attribute_name}_ms"), + ) + ) result[f"mean_{metric_attribute_name}_ms"] = getattr( - metrics, f"mean_{metric_attribute_name}_ms") + metrics, f"mean_{metric_attribute_name}_ms" + ) result[f"median_{metric_attribute_name}_ms"] = getattr( - metrics, f"median_{metric_attribute_name}_ms") + metrics, f"median_{metric_attribute_name}_ms" + ) result[f"std_{metric_attribute_name}_ms"] = getattr( - metrics, f"std_{metric_attribute_name}_ms") - for p, value in getattr(metrics, - f"percentiles_{metric_attribute_name}_ms"): + metrics, f"std_{metric_attribute_name}_ms" + ) + for p, value in getattr(metrics, f"percentiles_{metric_attribute_name}_ms"): p_word = str(int(p)) if int(p) == p else str(p) - print("{:<40} {:<10.2f}".format(f"P{p_word} {metric_name} (ms):", - value)) + print("{:<40} {:<10.2f}".format(f"P{p_word} {metric_name} (ms):", value)) result[f"p{p_word}_{metric_attribute_name}_ms"] = value process_one_metric("ttft", "TTFT", "Time to First Token") - process_one_metric("tpot", "TPOT", - "Time per Output Token (excl. 1st token)") + process_one_metric("tpot", "TPOT", "Time per Output Token (excl. 
1st token)") process_one_metric("itl", "ITL", "Inter-token Latency") process_one_metric("e2el", "E2EL", "End-to-end Latency") @@ -638,13 +669,13 @@ def process_one_metric( def evaluate(ret, args): - def _eval_correctness_json(expected, actual): # extract json string from string using regex import re - actual = actual.replace('\n', '').replace(' ', '').strip() + + actual = actual.replace("\n", "").replace(" ", "").strip() try: - actual = re.search(r'\{.*\}', actual).group() + actual = re.search(r"\{.*\}", actual).group() actual = json.loads(actual) except Exception: return False @@ -656,28 +687,32 @@ def _eval_correctness_choice(expected, actual): def _eval_correctness_regex(expected, actual): import re + return re.match(args.regex, actual) is not None def _eval_correctness(expected, actual): - if args.structure_type == 'guided_json': + if args.structure_type == "guided_json": return _eval_correctness_json(expected, actual) - elif args.structure_type == 'guided_regex': + elif args.structure_type == "guided_regex": return _eval_correctness_regex(expected, actual) - elif args.structure_type == 'guided_choice': + elif args.structure_type == "guided_choice": return _eval_correctness_choice(expected, actual) else: return None scores = [] for res in ret: - score = _eval_correctness(res['expected'], res['generated']) - res['correctness'] = score + score = _eval_correctness(res["expected"], res["generated"]) + res["correctness"] = score scores.append(score) not_none_scores = [score for score in scores if score is not None] - return (sum(not_none_scores) / len(not_none_scores) * - 100) if len(not_none_scores) > 0 else None + return ( + (sum(not_none_scores) / len(not_none_scores) * 100) + if len(not_none_scores) > 0 + else None + ) def parse_goodput(slo_pairs): @@ -689,9 +724,10 @@ def parse_goodput(slo_pairs): except ValueError as err: raise argparse.ArgumentTypeError( "Invalid format found for service level objectives. " - "Specify service level objectives for goodput as \"KEY:VALUE\" " + 'Specify service level objectives for goodput as "KEY:VALUE" ' "pairs, where the key is a metric name, and the value is a " - "number in milliseconds.") from err + "number in milliseconds." + ) from err return goodput_config_dict @@ -705,12 +741,14 @@ def check_goodput_args(args): raise ValueError( f"Invalid metric name found, {slo_name}: {slo_val}. " "The service level objective name should be one of " - f"{str(VALID_NAMES)}. ") + f"{str(VALID_NAMES)}. " + ) if slo_val < 0: raise ValueError( f"Invalid value found, {slo_name}: {slo_val}. " "The service level objective value should be " - "non-negative.") + "non-negative." 
+ ) return goodput_config_dict @@ -736,19 +774,19 @@ def main(args: argparse.Namespace): tokenizer_mode=args.tokenizer_mode, ) - if args.dataset == 'grammar': - args.structure_type = 'guided_grammar' - elif args.dataset == 'regex': - args.structure_type = 'guided_regex' - elif args.dataset == 'choice': - args.structure_type = 'guided_choice' + if args.dataset == "grammar": + args.structure_type = "guided_grammar" + elif args.dataset == "regex": + args.structure_type = "guided_regex" + elif args.dataset == "choice": + args.structure_type = "guided_choice" else: - args.structure_type = 'guided_json' + args.structure_type = "guided_json" if args.no_structured_output: args.structured_output_ratio = 0 if args.save_results: - result_file_name = f'{args.structured_output_ratio}guided' + result_file_name = f"{args.structured_output_ratio}guided" result_file_name += f"_{backend}" result_file_name += f"_{args.request_rate}qps" result_file_name += f"_{args.model.split('/')[-1]}" @@ -776,36 +814,29 @@ def main(args: argparse.Namespace): disable_tqdm=args.disable_tqdm, profile=args.profile, selected_percentile_metrics=args.percentile_metrics.split(","), - selected_percentiles=[ - float(p) for p in args.metric_percentiles.split(",") - ], + selected_percentiles=[float(p) for p in args.metric_percentiles.split(",")], ignore_eos=args.ignore_eos, max_concurrency=args.max_concurrency, structured_output_ratio=args.structured_output_ratio, goodput_config_dict=goodput_config_dict, - )) + ) + ) # Save config and results to json score = evaluate(ret, args) - print("correct_rate(%)", score, '\n') + print("correct_rate(%)", score, "\n") if args.save_results: results = { - "backend": - backend, - "model_id": - model_id, - "tokenizer_id": - tokenizer_id, - "num_prompts": - args.num_prompts, - "request_rate": - args.request_rate if args.request_rate < float("inf") else "inf", - "burstiness": - args.burstiness, - "max_concurrency": - args.max_concurrency, - "correct_rate(%)": - score + "backend": backend, + "model_id": model_id, + "tokenizer_id": tokenizer_id, + "num_prompts": args.num_prompts, + "request_rate": args.request_rate + if args.request_rate < float("inf") + else "inf", + "burstiness": args.burstiness, + "max_concurrency": args.max_concurrency, + "correct_rate(%)": score, } results = {"outputs": ret, **results, **benchmark_result} @@ -814,13 +845,14 @@ def main(args: argparse.Namespace): result_file_name = args.result_filename if args.result_dir: result_file_name = os.path.join(args.result_dir, result_file_name) - with open(result_file_name, "w", encoding='utf-8') as outfile: + with open(result_file_name, "w", encoding="utf-8") as outfile: json.dump(results, outfile, indent=4) if __name__ == "__main__": parser = FlexibleArgumentParser( - description="Benchmark the online serving throughput.") + description="Benchmark the online serving throughput." + ) parser.add_argument( "--backend", type=str, @@ -842,16 +874,14 @@ def main(args: argparse.Namespace): default="/v1/completions", help="API endpoint.", ) - parser.add_argument("--dataset", - default='json', - choices=[ - 'json', 'json-unique', 'grammar', 'regex', - 'choice', 'xgrammar_bench' - ]) - parser.add_argument("--json-schema-path", - type=str, - default=None, - help="Path to json schema.") + parser.add_argument( + "--dataset", + default="json", + choices=["json", "json-unique", "grammar", "regex", "choice", "xgrammar_bench"], + ) + parser.add_argument( + "--json-schema-path", type=str, default=None, help="Path to json schema." 
+ ) parser.add_argument( "--max-concurrency", type=int, @@ -863,7 +893,8 @@ def main(args: argparse.Namespace): "initiated, this argument will control how many are actually allowed " "to execute at a time. This means that when used in combination, the " "actual request rate may be lower than specified with --request-rate, " - "if the server is not processing requests fast enough to keep up.") + "if the server is not processing requests fast enough to keep up.", + ) parser.add_argument( "--model", type=str, @@ -873,15 +904,13 @@ def main(args: argparse.Namespace): parser.add_argument( "--tokenizer", type=str, - help= - "Name or path of the tokenizer, if not using the default tokenizer.", # noqa: E501 + help="Name or path of the tokenizer, if not using the default tokenizer.", # noqa: E501 ) parser.add_argument( "--tokenizer-mode", type=str, default="auto", - help= - "Name or path of the tokenizer, if not using the default tokenizer.", # noqa: E501 + help="Name or path of the tokenizer, if not using the default tokenizer.", # noqa: E501 ) parser.add_argument( "--num-prompts", @@ -958,44 +987,51 @@ def main(args: argparse.Namespace): "--ignore-eos", action="store_true", help="Set ignore_eos flag when sending the benchmark request." - "Warning: ignore_eos is not supported in deepspeed_mii and tgi.") + "Warning: ignore_eos is not supported in deepspeed_mii and tgi.", + ) parser.add_argument( "--percentile-metrics", type=str, default="ttft,tpot,itl", help="Comma-separated list of selected metrics to report percentils. " "This argument specifies the metrics to report percentiles. " - "Allowed metric names are \"ttft\", \"tpot\", \"itl\", \"e2el\". " - "Default value is \"ttft,tpot,itl\".") + 'Allowed metric names are "ttft", "tpot", "itl", "e2el". ' + 'Default value is "ttft,tpot,itl".', + ) parser.add_argument( "--metric-percentiles", type=str, default="99", help="Comma-separated list of percentiles for selected metrics. " - "To report 25-th, 50-th, and 75-th percentiles, use \"25,50,75\". " - "Default value is \"99\". " - "Use \"--percentile-metrics\" to select metrics.", + 'To report 25-th, 50-th, and 75-th percentiles, use "25,50,75". ' + 'Default value is "99". ' + 'Use "--percentile-metrics" to select metrics.', ) parser.add_argument( "--goodput", nargs="+", required=False, - help="Specify service level objectives for goodput as \"KEY:VALUE\" " + help='Specify service level objectives for goodput as "KEY:VALUE" ' "pairs, where the key is a metric name, and the value is in " - "milliseconds. Multiple \"KEY:VALUE\" pairs can be provided, " + 'milliseconds. Multiple "KEY:VALUE" pairs can be provided, ' "separated by spaces. Allowed request level metric names are " - "\"ttft\", \"tpot\", \"e2el\". For more context on the definition of " + '"ttft", "tpot", "e2el". 
For more context on the definition of ' "goodput, refer to DistServe paper: https://arxiv.org/pdf/2401.09670 " - "and the blog: https://hao-ai-lab.github.io/blogs/distserve") - - parser.add_argument("--no-structured-output", - action='store_true', - default=False, - help="Whether to disable JSON decoding or not.") - parser.add_argument("--structured-output-ratio", - type=float, - default=1.0, - help="Ratio of Structured Outputs requests") + "and the blog: https://hao-ai-lab.github.io/blogs/distserve", + ) + + parser.add_argument( + "--no-structured-output", + action="store_true", + default=False, + help="Whether to disable JSON decoding or not.", + ) + parser.add_argument( + "--structured-output-ratio", + type=float, + default=1.0, + help="Ratio of Structured Outputs requests", + ) args = parser.parse_args() main(args) diff --git a/benchmarks/benchmark_throughput.py b/benchmarks/benchmark_throughput.py index cd6c76ad6096..dff0fc862564 100644 --- a/benchmarks/benchmark_throughput.py +++ b/benchmarks/benchmark_throughput.py @@ -1,5 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 """Benchmark offline inference throughput.""" + import argparse import dataclasses import json @@ -11,18 +12,25 @@ import torch import uvloop -from benchmark_dataset import (AIMODataset, BurstGPTDataset, - ConversationDataset, InstructCoderDataset, - RandomDataset, SampleRequest, ShareGPTDataset, - SonnetDataset, VisionArenaDataset) +from benchmark_dataset import ( + AIMODataset, + BurstGPTDataset, + ConversationDataset, + InstructCoderDataset, + RandomDataset, + SampleRequest, + ShareGPTDataset, + SonnetDataset, + VisionArenaDataset, +) from benchmark_utils import convert_to_pytorch_benchmark_format, write_to_json from tqdm import tqdm -from transformers import (AutoModelForCausalLM, AutoTokenizer, - PreTrainedTokenizerBase) +from transformers import AutoModelForCausalLM, AutoTokenizer, PreTrainedTokenizerBase from vllm.engine.arg_utils import AsyncEngineArgs, EngineArgs from vllm.entrypoints.openai.api_server import ( - build_async_engine_client_from_engine_args) + build_async_engine_client_from_engine_args, +) from vllm.inputs import TextPrompt, TokensPrompt from vllm.lora.request import LoRARequest from vllm.outputs import RequestOutput @@ -37,23 +45,30 @@ def run_vllm( disable_detokenize: bool = False, ) -> tuple[float, Optional[list[RequestOutput]]]: from vllm import LLM, SamplingParams + llm = LLM(**dataclasses.asdict(engine_args)) assert all( - llm.llm_engine.model_config.max_model_len >= ( - request.prompt_len + request.expected_output_len) - for request in requests), ( - "Please ensure that max_model_len is greater than the sum of" - " prompt_len and expected_output_len for all requests.") + llm.llm_engine.model_config.max_model_len + >= (request.prompt_len + request.expected_output_len) + for request in requests + ), ( + "Please ensure that max_model_len is greater than the sum of" + " prompt_len and expected_output_len for all requests." + ) # Add the requests to the engine. 
prompts: list[Union[TextPrompt, TokensPrompt]] = [] sampling_params: list[SamplingParams] = [] for request in requests: prompts.append( - TokensPrompt(prompt_token_ids=request.prompt["prompt_token_ids"], - multi_modal_data=request.multi_modal_data) - if "prompt_token_ids" in request.prompt else \ - TextPrompt(prompt=request.prompt, - multi_modal_data=request.multi_modal_data)) + TokensPrompt( + prompt_token_ids=request.prompt["prompt_token_ids"], + multi_modal_data=request.multi_modal_data, + ) + if "prompt_token_ids" in request.prompt + else TextPrompt( + prompt=request.prompt, multi_modal_data=request.multi_modal_data + ) + ) sampling_params.append( SamplingParams( n=n, @@ -62,7 +77,8 @@ def run_vllm( ignore_eos=True, max_tokens=request.expected_output_len, detokenize=not disable_detokenize, - )) + ) + ) lora_requests: Optional[list[LoRARequest]] = None if engine_args.enable_lora: lora_requests = [request.lora_request for request in requests] @@ -72,10 +88,9 @@ def run_vllm( outputs = None if not use_beam_search: start = time.perf_counter() - outputs = llm.generate(prompts, - sampling_params, - lora_request=lora_requests, - use_tqdm=True) + outputs = llm.generate( + prompts, sampling_params, lora_request=lora_requests, use_tqdm=True + ) end = time.perf_counter() else: assert lora_requests is None, "BeamSearch API does not support LoRA" @@ -91,30 +106,35 @@ def run_vllm( beam_width=n, max_tokens=output_len, ignore_eos=True, - )) + ), + ) end = time.perf_counter() return end - start, outputs def run_vllm_chat( - requests: list[SampleRequest], - n: int, - engine_args: EngineArgs, - disable_detokenize: bool = False) -> tuple[float, list[RequestOutput]]: + requests: list[SampleRequest], + n: int, + engine_args: EngineArgs, + disable_detokenize: bool = False, +) -> tuple[float, list[RequestOutput]]: """ Run vLLM chat benchmark. This function is recommended ONLY for benchmarking multimodal models as it properly handles multimodal inputs and chat formatting. For non-multimodal models, use run_vllm() instead. """ from vllm import LLM, SamplingParams + llm = LLM(**dataclasses.asdict(engine_args)) assert all( - llm.llm_engine.model_config.max_model_len >= ( - request.prompt_len + request.expected_output_len) - for request in requests), ( - "Please ensure that max_model_len is greater than the sum of " - "prompt_len and expected_output_len for all requests.") + llm.llm_engine.model_config.max_model_len + >= (request.prompt_len + request.expected_output_len) + for request in requests + ), ( + "Please ensure that max_model_len is greater than the sum of " + "prompt_len and expected_output_len for all requests." 
+ ) prompts = [] sampling_params: list[SamplingParams] = [] @@ -128,7 +148,8 @@ def run_vllm_chat( ignore_eos=True, max_tokens=request.expected_output_len, detokenize=not disable_detokenize, - )) + ) + ) start = time.perf_counter() outputs = llm.chat(prompts, sampling_params, use_tqdm=True) end = time.perf_counter() @@ -145,14 +166,17 @@ async def run_vllm_async( from vllm import SamplingParams async with build_async_engine_client_from_engine_args( - engine_args, disable_frontend_multiprocessing) as llm: + engine_args, disable_frontend_multiprocessing + ) as llm: model_config = await llm.get_model_config() assert all( - model_config.max_model_len >= (request.prompt_len + - request.expected_output_len) - for request in requests), ( - "Please ensure that max_model_len is greater than the sum of" - " prompt_len and expected_output_len for all requests.") + model_config.max_model_len + >= (request.prompt_len + request.expected_output_len) + for request in requests + ), ( + "Please ensure that max_model_len is greater than the sum of" + " prompt_len and expected_output_len for all requests." + ) # Add the requests to the engine. prompts: list[Union[TextPrompt, TokensPrompt]] = [] @@ -160,11 +184,15 @@ async def run_vllm_async( lora_requests: list[Optional[LoRARequest]] = [] for request in requests: prompts.append( - TokensPrompt(prompt_token_ids=request.prompt["prompt_token_ids"], - multi_modal_data=request.multi_modal_data) - if "prompt_token_ids" in request.prompt else \ - TextPrompt(prompt=request.prompt, - multi_modal_data=request.multi_modal_data)) + TokensPrompt( + prompt_token_ids=request.prompt["prompt_token_ids"], + multi_modal_data=request.multi_modal_data, + ) + if "prompt_token_ids" in request.prompt + else TextPrompt( + prompt=request.prompt, multi_modal_data=request.multi_modal_data + ) + ) sampling_params.append( SamplingParams( n=n, @@ -173,17 +201,16 @@ async def run_vllm_async( ignore_eos=True, max_tokens=request.expected_output_len, detokenize=not disable_detokenize, - )) + ) + ) lora_requests.append(request.lora_request) generators = [] start = time.perf_counter() - for i, (prompt, sp, - lr) in enumerate(zip(prompts, sampling_params, lora_requests)): - generator = llm.generate(prompt, - sp, - lora_request=lr, - request_id=f"test{i}") + for i, (prompt, sp, lr) in enumerate( + zip(prompts, sampling_params, lora_requests) + ): + generator = llm.generate(prompt, sp, lora_request=lr, request_id=f"test{i}") generators.append(generator) all_gens = merge_async_iterators(*generators) async for i, res in all_gens: @@ -202,7 +229,8 @@ def run_hf( disable_detokenize: bool = False, ) -> float: llm = AutoModelForCausalLM.from_pretrained( - model, torch_dtype=torch.float16, trust_remote_code=trust_remote_code) + model, torch_dtype=torch.float16, trust_remote_code=trust_remote_code + ) if llm.config.model_type == "llama": # To enable padding in the HF backend. tokenizer.pad_token = tokenizer.eos_token @@ -225,14 +253,15 @@ def run_hf( # Check if we can add more requests to the batch. next_prompt_len = requests[i + 1].prompt_len next_output_len = requests[i + 1].expected_output_len - if (max(max_prompt_len, next_prompt_len) + - max(max_output_len, next_output_len)) <= 2048: + if ( + max(max_prompt_len, next_prompt_len) + + max(max_output_len, next_output_len) + ) <= 2048: # We can add more requests to the batch. continue # Generate the sequences. 
- input_ids = tokenizer(batch, return_tensors="pt", - padding=True).input_ids + input_ids = tokenizer(batch, return_tensors="pt", padding=True).input_ids llm_outputs = llm.generate( input_ids=input_ids.cuda(), do_sample=True, @@ -262,6 +291,7 @@ def run_mii( output_len: int, ) -> float: from mii import client, serve + llm = serve(model, tensor_parallel=tensor_parallel_size) prompts = [request.prompt for request in requests] @@ -273,8 +303,9 @@ def run_mii( return end - start -def save_to_pytorch_benchmark_format(args: argparse.Namespace, - results: dict[str, Any]) -> None: +def save_to_pytorch_benchmark_format( + args: argparse.Namespace, results: dict[str, Any] +) -> None: pt_records = convert_to_pytorch_benchmark_format( args=args, metrics={ @@ -282,9 +313,9 @@ def save_to_pytorch_benchmark_format(args: argparse.Namespace, "tokens_per_second": [results["tokens_per_second"]], }, extra_info={ - k: results[k] - for k in ["elapsed_time", "num_requests", "total_num_tokens"] - }) + k: results[k] for k in ["elapsed_time", "num_requests", "total_num_tokens"] + }, + ) if pt_records: # Don't use json suffix here as we don't want CI to pick it up pt_file = f"{os.path.splitext(args.output_json)[0]}.pytorch.json" @@ -316,7 +347,8 @@ def get_requests(args, tokenizer): sample_kwargs["enable_multimodal_chat"] = True elif args.dataset_name == "sonnet": assert tokenizer.chat_template or tokenizer.default_chat_template, ( - "Tokenizer/model must have chat template for sonnet dataset.") + "Tokenizer/model must have chat template for sonnet dataset." + ) dataset_cls = SonnetDataset sample_kwargs["prefix_len"] = args.prefix_len sample_kwargs["return_prompt_formatted"] = True @@ -325,21 +357,21 @@ def get_requests(args, tokenizer): elif args.dataset_name == "hf": if args.dataset_path in VisionArenaDataset.SUPPORTED_DATASET_PATHS: dataset_cls = VisionArenaDataset - common_kwargs['dataset_subset'] = None - common_kwargs['dataset_split'] = "train" + common_kwargs["dataset_subset"] = None + common_kwargs["dataset_split"] = "train" sample_kwargs["enable_multimodal_chat"] = True elif args.dataset_path in InstructCoderDataset.SUPPORTED_DATASET_PATHS: dataset_cls = InstructCoderDataset - common_kwargs['dataset_split'] = "train" + common_kwargs["dataset_split"] = "train" elif args.dataset_path in ConversationDataset.SUPPORTED_DATASET_PATHS: dataset_cls = ConversationDataset - common_kwargs['dataset_subset'] = args.hf_subset - common_kwargs['dataset_split'] = args.hf_split + common_kwargs["dataset_subset"] = args.hf_subset + common_kwargs["dataset_split"] = args.hf_split sample_kwargs["enable_multimodal_chat"] = True elif args.dataset_path in AIMODataset.SUPPORTED_DATASET_PATHS: dataset_cls = AIMODataset - common_kwargs['dataset_subset'] = None - common_kwargs['dataset_split'] = "train" + common_kwargs["dataset_subset"] = None + common_kwargs["dataset_split"] = "train" else: raise ValueError(f"Unknown dataset name: {args.dataset_name}") # Remove None values @@ -354,10 +386,10 @@ def main(args: argparse.Namespace): random.seed(args.seed) # Sample the requests. 
tokenizer = AutoTokenizer.from_pretrained( - args.tokenizer, trust_remote_code=args.trust_remote_code) + args.tokenizer, trust_remote_code=args.trust_remote_code + ) requests = get_requests(args, tokenizer) - is_multi_modal = any(request.multi_modal_data is not None - for request in requests) + is_multi_modal = any(request.multi_modal_data is not None for request in requests) request_outputs: Optional[list[RequestOutput]] = None if args.backend == "vllm": if args.async_engine: @@ -368,23 +400,34 @@ def main(args: argparse.Namespace): AsyncEngineArgs.from_cli_args(args), args.disable_frontend_multiprocessing, args.disable_detokenize, - )) + ) + ) else: elapsed_time, request_outputs = run_vllm( - requests, args.n, EngineArgs.from_cli_args(args), - args.disable_detokenize) + requests, + args.n, + EngineArgs.from_cli_args(args), + args.disable_detokenize, + ) elif args.backend == "hf": assert args.tensor_parallel_size == 1 - elapsed_time = run_hf(requests, args.model, tokenizer, args.n, - args.hf_max_batch_size, args.trust_remote_code, - args.disable_detokenize) + elapsed_time = run_hf( + requests, + args.model, + tokenizer, + args.n, + args.hf_max_batch_size, + args.trust_remote_code, + args.disable_detokenize, + ) elif args.backend == "mii": - elapsed_time = run_mii(requests, args.model, args.tensor_parallel_size, - args.output_len) + elapsed_time = run_mii( + requests, args.model, args.tensor_parallel_size, args.output_len + ) elif args.backend == "vllm-chat": elapsed_time, request_outputs = run_vllm_chat( - requests, args.n, EngineArgs.from_cli_args(args), - args.disable_detokenize) + requests, args.n, EngineArgs.from_cli_args(args), args.disable_detokenize + ) else: raise ValueError(f"Unknown backend: {args.backend}") @@ -396,28 +439,31 @@ def main(args: argparse.Namespace): for ro in request_outputs: if not isinstance(ro, RequestOutput): continue - total_prompt_tokens += len( - ro.prompt_token_ids) if ro.prompt_token_ids else 0 - total_output_tokens += sum( - len(o.token_ids) for o in ro.outputs if o) + total_prompt_tokens += ( + len(ro.prompt_token_ids) if ro.prompt_token_ids else 0 + ) + total_output_tokens += sum(len(o.token_ids) for o in ro.outputs if o) total_num_tokens = total_prompt_tokens + total_output_tokens else: - total_num_tokens = sum(r.prompt_len + r.expected_output_len - for r in requests) + total_num_tokens = sum(r.prompt_len + r.expected_output_len for r in requests) total_output_tokens = sum(r.expected_output_len for r in requests) total_prompt_tokens = total_num_tokens - total_output_tokens if is_multi_modal and args.backend != "vllm-chat": - print("\033[91mWARNING\033[0m: Multi-modal request with " - f"{args.backend} backend detected. The " - "following metrics are not accurate because image tokens are not" - " counted. See vllm-project/vllm/issues/9778 for details.") + print( + "\033[91mWARNING\033[0m: Multi-modal request with " + f"{args.backend} backend detected. The " + "following metrics are not accurate because image tokens are not" + " counted. See vllm-project/vllm/issues/9778 for details." + ) # TODO(vllm-project/vllm/issues/9778): Count multi-modal token length. 
# vllm-chat backend counts the image tokens now - print(f"Throughput: {len(requests) / elapsed_time:.2f} requests/s, " - f"{total_num_tokens / elapsed_time:.2f} total tokens/s, " - f"{total_output_tokens / elapsed_time:.2f} output tokens/s") + print( + f"Throughput: {len(requests) / elapsed_time:.2f} requests/s, " + f"{total_num_tokens / elapsed_time:.2f} total tokens/s, " + f"{total_output_tokens / elapsed_time:.2f} output tokens/s" + ) print(f"Total num prompt tokens: {total_prompt_tokens}") print(f"Total num output tokens: {total_output_tokens}") @@ -445,7 +491,8 @@ def validate_args(args): warnings.warn( "The '--dataset' argument will be deprecated in the next release. " "Please use '--dataset-name' and '--dataset-path' instead.", - stacklevel=2) + stacklevel=2, + ) args.dataset_path = args.dataset if not getattr(args, "tokenizer", None): @@ -458,9 +505,8 @@ def validate_args(args): # === Dataset Configuration === if not args.dataset and not args.dataset_path: - print( - "When dataset path is not set, it will default to random dataset") - args.dataset_name = 'random' + print("When dataset path is not set, it will default to random dataset") + args.dataset_name = "random" if args.input_len is None: raise ValueError("input_len must be provided for a random dataset") @@ -468,41 +514,55 @@ def validate_args(args): # --hf-subset and --hf-split: only used # when dataset_name is 'hf' if args.dataset_name != "hf" and ( - getattr(args, "hf_subset", None) is not None - or getattr(args, "hf_split", None) is not None): - warnings.warn("--hf-subset and --hf-split will be ignored \ + getattr(args, "hf_subset", None) is not None + or getattr(args, "hf_split", None) is not None + ): + warnings.warn( + "--hf-subset and --hf-split will be ignored \ since --dataset-name is not 'hf'.", - stacklevel=2) + stacklevel=2, + ) elif args.dataset_name == "hf": if args.dataset_path in ( - VisionArenaDataset.SUPPORTED_DATASET_PATHS.keys() - | ConversationDataset.SUPPORTED_DATASET_PATHS): - assert args.backend == "vllm-chat", f"{args.dataset_path} needs to use vllm-chat as the backend." #noqa: E501 - elif args.dataset_path in (InstructCoderDataset.SUPPORTED_DATASET_PATHS - | AIMODataset.SUPPORTED_DATASET_PATHS): - assert args.backend == "vllm", f"{args.dataset_path} needs to use vllm as the backend." #noqa: E501 + VisionArenaDataset.SUPPORTED_DATASET_PATHS.keys() + | ConversationDataset.SUPPORTED_DATASET_PATHS + ): + assert args.backend == "vllm-chat", ( + f"{args.dataset_path} needs to use vllm-chat as the backend." + ) # noqa: E501 + elif args.dataset_path in ( + InstructCoderDataset.SUPPORTED_DATASET_PATHS + | AIMODataset.SUPPORTED_DATASET_PATHS + ): + assert args.backend == "vllm", ( + f"{args.dataset_path} needs to use vllm as the backend." + ) # noqa: E501 else: - raise ValueError( - f"{args.dataset_path} is not supported by hf dataset.") + raise ValueError(f"{args.dataset_path} is not supported by hf dataset.") # --random-range-ratio: only used when dataset_name is 'random' - if args.dataset_name != 'random' and args.random_range_ratio is not None: - warnings.warn("--random-range-ratio will be ignored since \ + if args.dataset_name != "random" and args.random_range_ratio is not None: + warnings.warn( + "--random-range-ratio will be ignored since \ --dataset-name is not 'random'.", - stacklevel=2) + stacklevel=2, + ) # --prefix-len: only used when dataset_name is 'random', 'sonnet', or not # set. 
- if args.dataset_name not in {"random", "sonnet", None - } and args.prefix_len is not None: - warnings.warn("--prefix-len will be ignored since --dataset-name\ + if ( + args.dataset_name not in {"random", "sonnet", None} + and args.prefix_len is not None + ): + warnings.warn( + "--prefix-len will be ignored since --dataset-name\ is not 'random', 'sonnet', or not set.", - stacklevel=2) + stacklevel=2, + ) # === LoRA Settings === if getattr(args, "enable_lora", False) and args.backend != "vllm": - raise ValueError( - "LoRA benchmarking is only supported for vLLM backend") + raise ValueError("LoRA benchmarking is only supported for vLLM backend") if getattr(args, "enable_lora", False) and args.lora_path is None: raise ValueError("LoRA path must be provided when enable_lora is True") @@ -512,8 +572,10 @@ def validate_args(args): if args.backend != "hf" and args.hf_max_batch_size is not None: raise ValueError("HF max batch size is only for HF backend.") - if args.backend in {"hf", "mii"} and getattr(args, "quantization", - None) is not None: + if ( + args.backend in {"hf", "mii"} + and getattr(args, "quantization", None) is not None + ): raise ValueError("Quantization is only for vLLM backend.") if args.backend == "mii" and args.dtype != "auto": @@ -521,29 +583,32 @@ def validate_args(args): if args.backend == "mii" and args.n != 1: raise ValueError("n must be 1 for MII backend.") if args.backend == "mii" and args.tokenizer != args.model: - raise ValueError( - "Tokenizer must be the same as the model for MII backend.") + raise ValueError("Tokenizer must be the same as the model for MII backend.") # --data-parallel is not supported currently. # https://github.com/vllm-project/vllm/issues/16222 if args.data_parallel_size > 1: raise ValueError( "Data parallel is not supported in offline benchmark, \ - please use benchmark serving instead") + please use benchmark serving instead" + ) if __name__ == "__main__": parser = FlexibleArgumentParser(description="Benchmark the throughput.") - parser.add_argument("--backend", - type=str, - choices=["vllm", "hf", "mii", "vllm-chat"], - default="vllm") + parser.add_argument( + "--backend", + type=str, + choices=["vllm", "hf", "mii", "vllm-chat"], + default="vllm", + ) parser.add_argument( "--dataset-name", type=str, choices=["sharegpt", "random", "sonnet", "burstgpt", "hf"], help="Name of the dataset to benchmark on.", - default="sharegpt") + default="sharegpt", + ) parser.add_argument( "--dataset", type=str, @@ -551,57 +616,70 @@ def validate_args(args): help="Path to the ShareGPT dataset, will be deprecated in\ the next release. The dataset is expected to " "be a json in form of list[dict[..., conversations: " - "list[dict[..., value: ]]]]") - parser.add_argument("--dataset-path", - type=str, - default=None, - help="Path to the dataset") - parser.add_argument("--input-len", - type=int, - default=None, - help="Input prompt length for each request") - parser.add_argument("--output-len", - type=int, - default=None, - help="Output length for each request. 
Overrides the " - "output length from the dataset.") - parser.add_argument("--n", - type=int, - default=1, - help="Number of generated sequences per prompt.") - parser.add_argument("--num-prompts", - type=int, - default=1000, - help="Number of prompts to process.") - parser.add_argument("--hf-max-batch-size", - type=int, - default=None, - help="Maximum batch size for HF backend.") + "list[dict[..., value: ]]]]", + ) + parser.add_argument( + "--dataset-path", type=str, default=None, help="Path to the dataset" + ) + parser.add_argument( + "--input-len", + type=int, + default=None, + help="Input prompt length for each request", + ) + parser.add_argument( + "--output-len", + type=int, + default=None, + help="Output length for each request. Overrides the " + "output length from the dataset.", + ) + parser.add_argument( + "--n", type=int, default=1, help="Number of generated sequences per prompt." + ) + parser.add_argument( + "--num-prompts", type=int, default=1000, help="Number of prompts to process." + ) + parser.add_argument( + "--hf-max-batch-size", + type=int, + default=None, + help="Maximum batch size for HF backend.", + ) parser.add_argument( - '--output-json', + "--output-json", type=str, default=None, - help='Path to save the throughput results in JSON format.') - parser.add_argument("--async-engine", - action='store_true', - default=False, - help="Use vLLM async engine rather than LLM class.") - parser.add_argument("--disable-frontend-multiprocessing", - action='store_true', - default=False, - help="Disable decoupled async engine frontend.") + help="Path to save the throughput results in JSON format.", + ) + parser.add_argument( + "--async-engine", + action="store_true", + default=False, + help="Use vLLM async engine rather than LLM class.", + ) + parser.add_argument( + "--disable-frontend-multiprocessing", + action="store_true", + default=False, + help="Disable decoupled async engine frontend.", + ) parser.add_argument( "--disable-detokenize", action="store_true", - help=("Do not detokenize the response (i.e. do not include " - "detokenization time in the measurement)")) + help=( + "Do not detokenize the response (i.e. do not include " + "detokenization time in the measurement)" + ), + ) # LoRA parser.add_argument( "--lora-path", type=str, default=None, help="Path to the LoRA adapters to use. This can be an absolute path, " - "a relative path, or a Hugging Face model identifier.") + "a relative path, or a Hugging Face model identifier.", + ) parser.add_argument( "--prefix-len", type=int, @@ -615,7 +693,8 @@ def validate_args(args): f"prefix_len (default: {SonnetDataset.DEFAULT_PREFIX_LEN}) " "controls how much of the input is fixed lines versus " "random lines, but the total input length remains approximately " - "input_len tokens.") + "input_len tokens.", + ) # random dataset parser.add_argument( "--random-range-ratio", @@ -629,14 +708,12 @@ def validate_args(args): ) # hf dtaset - parser.add_argument("--hf-subset", - type=str, - default=None, - help="Subset of the HF dataset.") - parser.add_argument("--hf-split", - type=str, - default=None, - help="Split of the HF dataset.") + parser.add_argument( + "--hf-subset", type=str, default=None, help="Subset of the HF dataset." + ) + parser.add_argument( + "--hf-split", type=str, default=None, help="Split of the HF dataset." 
+ ) parser = AsyncEngineArgs.add_cli_args(parser) args = parser.parse_args() diff --git a/benchmarks/benchmark_utils.py b/benchmarks/benchmark_utils.py index 45a0ddbd5d08..b0c4fca92c3d 100644 --- a/benchmarks/benchmark_utils.py +++ b/benchmarks/benchmark_utils.py @@ -7,9 +7,9 @@ from typing import Any -def convert_to_pytorch_benchmark_format(args: argparse.Namespace, - metrics: dict[str, list], - extra_info: dict[str, Any]) -> list: +def convert_to_pytorch_benchmark_format( + args: argparse.Namespace, metrics: dict[str, list], extra_info: dict[str, Any] +) -> list: """ Save the benchmark results in the format used by PyTorch OSS benchmark with on metric per record @@ -37,12 +37,12 @@ def convert_to_pytorch_benchmark_format(args: argparse.Namespace, }, } - tp = record["benchmark"]["extra_info"]["args"].get( - "tensor_parallel_size") + tp = record["benchmark"]["extra_info"]["args"].get("tensor_parallel_size") # Save tensor_parallel_size parameter if it's part of the metadata if not tp and "tensor_parallel_size" in extra_info: - record["benchmark"]["extra_info"]["args"][ - "tensor_parallel_size"] = extra_info["tensor_parallel_size"] + record["benchmark"]["extra_info"]["args"]["tensor_parallel_size"] = ( + extra_info["tensor_parallel_size"] + ) records.append(record) @@ -50,7 +50,6 @@ def convert_to_pytorch_benchmark_format(args: argparse.Namespace, class InfEncoder(json.JSONEncoder): - def clear_inf(self, o: Any): if isinstance(o, dict): return {k: self.clear_inf(v) for k, v in o.items()} diff --git a/benchmarks/cutlass_benchmarks/sparse_benchmarks.py b/benchmarks/cutlass_benchmarks/sparse_benchmarks.py index 9e36b0a9d3bb..da258f98e085 100644 --- a/benchmarks/cutlass_benchmarks/sparse_benchmarks.py +++ b/benchmarks/cutlass_benchmarks/sparse_benchmarks.py @@ -23,8 +23,9 @@ # bench -def bench_fn(label: str, sub_label: str, description: str, fn: Callable, *args, - **kwargs) -> TMeasurement: +def bench_fn( + label: str, sub_label: str, description: str, fn: Callable, *args, **kwargs +) -> TMeasurement: min_run_time = 1 globals = { @@ -41,16 +42,18 @@ def bench_fn(label: str, sub_label: str, description: str, fn: Callable, *args, ).blocked_autorange(min_run_time=min_run_time) -def bench_int8(dtype: torch.dtype, m: int, k: int, n: int, label: str, - sub_label: str) -> Iterable[TMeasurement]: +def bench_int8( + dtype: torch.dtype, m: int, k: int, n: int, label: str, sub_label: str +) -> Iterable[TMeasurement]: assert dtype == torch.int8 b_compressed, e, a, b = make_rand_sparse_tensors(torch.int8, m, n, k) scale_a = torch.tensor(1.0, device="cuda", dtype=torch.float32) scale_b = torch.tensor(1.0, device="cuda", dtype=torch.float32) - bias = torch.zeros((n, ), device="cuda", dtype=torch.bfloat16) + bias = torch.zeros((n,), device="cuda", dtype=torch.bfloat16) - out = ops.cutlass_scaled_sparse_mm(a, b_compressed, e, scale_a, scale_b, - torch.bfloat16) + out = ops.cutlass_scaled_sparse_mm( + a, b_compressed, e, scale_a, scale_b, torch.bfloat16 + ) out_ref = ops.cutlass_scaled_mm(a, b, scale_a, scale_b, torch.bfloat16) if not torch.allclose(out, out_ref): @@ -63,54 +66,107 @@ def bench_int8(dtype: torch.dtype, m: int, k: int, n: int, label: str, timers = [] # pytorch impl - bfloat16 timers.append( - bench_fn(label, sub_label, "pytorch_bf16_bf16_bf16_matmul-no-scales", - torch.mm, a.to(dtype=torch.bfloat16), - b.to(dtype=torch.bfloat16))) + bench_fn( + label, + sub_label, + "pytorch_bf16_bf16_bf16_matmul-no-scales", + torch.mm, + a.to(dtype=torch.bfloat16), + b.to(dtype=torch.bfloat16), + ) + ) # pytorch 
impl - float16 timers.append( - bench_fn(label, sub_label, - "pytorch_fp16_fp16_fp16_matmul-no-scales", torch.mm, - a.to(dtype=torch.float16), b.to(dtype=torch.float16))) + bench_fn( + label, + sub_label, + "pytorch_fp16_fp16_fp16_matmul-no-scales", + torch.mm, + a.to(dtype=torch.float16), + b.to(dtype=torch.float16), + ) + ) # cutlass impl timers.append( - bench_fn(label, sub_label, "cutlass_i8_i8_bf16_scaled_mm", - ops.cutlass_scaled_mm, a, b, scale_a, scale_b, - torch.bfloat16)) + bench_fn( + label, + sub_label, + "cutlass_i8_i8_bf16_scaled_mm", + ops.cutlass_scaled_mm, + a, + b, + scale_a, + scale_b, + torch.bfloat16, + ) + ) # cutlass with bias timers.append( - bench_fn(label, sub_label, "cutlass_i8_i8_bf16_scaled_mm_bias", - ops.cutlass_scaled_mm, a, b, scale_a, scale_b, torch.bfloat16, - bias)) + bench_fn( + label, + sub_label, + "cutlass_i8_i8_bf16_scaled_mm_bias", + ops.cutlass_scaled_mm, + a, + b, + scale_a, + scale_b, + torch.bfloat16, + bias, + ) + ) # cutlass sparse impl timers.append( - bench_fn(label, sub_label, "cutlass_i8_i8_bf16_scaled_sparse_mm", - ops.cutlass_scaled_sparse_mm, a, b_compressed, e, scale_a, - scale_b, torch.bfloat16)) + bench_fn( + label, + sub_label, + "cutlass_i8_i8_bf16_scaled_sparse_mm", + ops.cutlass_scaled_sparse_mm, + a, + b_compressed, + e, + scale_a, + scale_b, + torch.bfloat16, + ) + ) # cutlass sparse with bias timers.append( - bench_fn(label, sub_label, "cutlass_i8_i8_bf16_scaled_sparse_mm_bias", - ops.cutlass_scaled_sparse_mm, a, b_compressed, e, scale_a, - scale_b, torch.bfloat16, bias)) + bench_fn( + label, + sub_label, + "cutlass_i8_i8_bf16_scaled_sparse_mm_bias", + ops.cutlass_scaled_sparse_mm, + a, + b_compressed, + e, + scale_a, + scale_b, + torch.bfloat16, + bias, + ) + ) return timers -def bench_fp8(dtype: torch.dtype, m: int, k: int, n: int, label: str, - sub_label: str) -> Iterable[TMeasurement]: +def bench_fp8( + dtype: torch.dtype, m: int, k: int, n: int, label: str, sub_label: str +) -> Iterable[TMeasurement]: assert dtype == torch.float8_e4m3fn - b_compressed, e, a, b = make_rand_sparse_tensors(torch.float8_e4m3fn, m, n, - k) + b_compressed, e, a, b = make_rand_sparse_tensors(torch.float8_e4m3fn, m, n, k) scale_a = torch.tensor(1.0, device="cuda", dtype=torch.float32) scale_b = torch.tensor(1.0, device="cuda", dtype=torch.float32) - bias = torch.zeros((n, ), device="cuda", dtype=torch.bfloat16) + bias = torch.zeros((n,), device="cuda", dtype=torch.bfloat16) - out = ops.cutlass_scaled_sparse_mm(a, b_compressed, e, scale_a, scale_b, - torch.bfloat16) + out = ops.cutlass_scaled_sparse_mm( + a, b_compressed, e, scale_a, scale_b, torch.bfloat16 + ) out_ref = ops.cutlass_scaled_mm(a, b, scale_a, scale_b, torch.bfloat16) if not torch.allclose(out, out_ref): @@ -124,97 +180,165 @@ def bench_fp8(dtype: torch.dtype, m: int, k: int, n: int, label: str, # pytorch impl w. 
bf16 timers.append( - bench_fn(label, sub_label, "pytorch_bf16_bf16_bf16_matmul-no-scales", - torch.mm, a.to(dtype=torch.bfloat16, device="cuda"), - b.to(dtype=torch.bfloat16, device="cuda"))) + bench_fn( + label, + sub_label, + "pytorch_bf16_bf16_bf16_matmul-no-scales", + torch.mm, + a.to(dtype=torch.bfloat16, device="cuda"), + b.to(dtype=torch.bfloat16, device="cuda"), + ) + ) # pytorch impl: bf16 output, without fp8 fast accum timers.append( - bench_fn(label, - sub_label, - "pytorch_fp8_fp8_bf16_scaled_mm", - torch._scaled_mm, - a, - b, - scale_a=scale_a, - scale_b=scale_b, - out_dtype=torch.bfloat16)) + bench_fn( + label, + sub_label, + "pytorch_fp8_fp8_bf16_scaled_mm", + torch._scaled_mm, + a, + b, + scale_a=scale_a, + scale_b=scale_b, + out_dtype=torch.bfloat16, + ) + ) # pytorch impl: bf16 output, with fp8 fast accum timers.append( - bench_fn(label, - sub_label, - "pytorch_fp8_fp8_bf16_scaled_mm_fast_accum", - torch._scaled_mm, - a, - b, - scale_a=scale_a, - scale_b=scale_b, - out_dtype=torch.bfloat16, - use_fast_accum=True)) + bench_fn( + label, + sub_label, + "pytorch_fp8_fp8_bf16_scaled_mm_fast_accum", + torch._scaled_mm, + a, + b, + scale_a=scale_a, + scale_b=scale_b, + out_dtype=torch.bfloat16, + use_fast_accum=True, + ) + ) # pytorch impl: fp16 output, without fp8 fast accum timers.append( - bench_fn(label, - sub_label, - "pytorch_fp8_fp8_fp16_scaled_mm", - torch._scaled_mm, - a, - b, - scale_a=scale_a, - scale_b=scale_b, - out_dtype=torch.float16)) + bench_fn( + label, + sub_label, + "pytorch_fp8_fp8_fp16_scaled_mm", + torch._scaled_mm, + a, + b, + scale_a=scale_a, + scale_b=scale_b, + out_dtype=torch.float16, + ) + ) # pytorch impl: fp16 output, with fp8 fast accum timers.append( - bench_fn(label, - sub_label, - "pytorch_fp8_fp8_fp16_scaled_mm_fast_accum", - torch._scaled_mm, - a, - b, - scale_a=scale_a, - scale_b=scale_b, - out_dtype=torch.float16, - use_fast_accum=True)) + bench_fn( + label, + sub_label, + "pytorch_fp8_fp8_fp16_scaled_mm_fast_accum", + torch._scaled_mm, + a, + b, + scale_a=scale_a, + scale_b=scale_b, + out_dtype=torch.float16, + use_fast_accum=True, + ) + ) # cutlass impl: bf16 output timers.append( - bench_fn(label, sub_label, "cutlass_fp8_fp8_bf16_scaled_mm", - ops.cutlass_scaled_mm, a, b, scale_a, scale_b, - torch.bfloat16)) + bench_fn( + label, + sub_label, + "cutlass_fp8_fp8_bf16_scaled_mm", + ops.cutlass_scaled_mm, + a, + b, + scale_a, + scale_b, + torch.bfloat16, + ) + ) # cutlass impl: bf16 output timers.append( - bench_fn(label, sub_label, "cutlass_fp8_fp8_bf16_scaled_sparse_mm", - ops.cutlass_scaled_sparse_mm, a, b_compressed, e, scale_a, - scale_b, torch.bfloat16)) + bench_fn( + label, + sub_label, + "cutlass_fp8_fp8_bf16_scaled_sparse_mm", + ops.cutlass_scaled_sparse_mm, + a, + b_compressed, + e, + scale_a, + scale_b, + torch.bfloat16, + ) + ) # cutlass impl: fp16 output timers.append( - bench_fn(label, sub_label, "cutlass_fp8_fp8_fp16_scaled_sparse_mm", - ops.cutlass_scaled_sparse_mm, a, b_compressed, e, scale_a, - scale_b, torch.float16)) + bench_fn( + label, + sub_label, + "cutlass_fp8_fp8_fp16_scaled_sparse_mm", + ops.cutlass_scaled_sparse_mm, + a, + b_compressed, + e, + scale_a, + scale_b, + torch.float16, + ) + ) # cutlass impl: bf16 output, with bias timers.append( - bench_fn(label, sub_label, - "cutlass_fp8_fp8_bf16_scaled_sparse_mm_bias", - ops.cutlass_scaled_sparse_mm, a, b_compressed, e, scale_a, - scale_b, torch.bfloat16, bias)) + bench_fn( + label, + sub_label, + "cutlass_fp8_fp8_bf16_scaled_sparse_mm_bias", + 
ops.cutlass_scaled_sparse_mm, + a, + b_compressed, + e, + scale_a, + scale_b, + torch.bfloat16, + bias, + ) + ) # cutlass impl: fp16 output, with bias timers.append( - bench_fn(label, sub_label, - "cutlass_fp8_fp8_fp16_scaled_sparse_mm_bias", - ops.cutlass_scaled_sparse_mm, a, b_compressed, e, scale_a, - scale_b, torch.float16, bias.to(dtype=torch.float16))) + bench_fn( + label, + sub_label, + "cutlass_fp8_fp8_fp16_scaled_sparse_mm_bias", + ops.cutlass_scaled_sparse_mm, + a, + b_compressed, + e, + scale_a, + scale_b, + torch.float16, + bias.to(dtype=torch.float16), + ) + ) return timers -def bench(dtype: torch.dtype, m: int, k: int, n: int, label: str, - sub_label: str) -> Iterable[TMeasurement]: +def bench( + dtype: torch.dtype, m: int, k: int, n: int, label: str, sub_label: str +) -> Iterable[TMeasurement]: if dtype == torch.int8: return bench_int8(dtype, m, k, n, label, sub_label) if dtype == torch.float8_e4m3fn: @@ -228,12 +352,12 @@ def print_timers(timers: Iterable[TMeasurement]): compare.print() -def run(dtype: torch.dtype, - MKNs: Iterable[tuple[int, int, int]]) -> Iterable[TMeasurement]: +def run( + dtype: torch.dtype, MKNs: Iterable[tuple[int, int, int]] +) -> Iterable[TMeasurement]: results = [] for m, k, n in MKNs: - timers = bench(dtype, m, k, n, f"scaled-{dtype}-gemm", - f"MKN=({m}x{k}x{n})") + timers = bench(dtype, m, k, n, f"scaled-{dtype}-gemm", f"MKN=({m}x{k}x{n})") print_timers(timers) results.extend(timers) @@ -241,10 +365,12 @@ def run(dtype: torch.dtype, # output makers -def make_output(data: Iterable[TMeasurement], - MKNs: Iterable[tuple[int, int, int]], - base_description: str, - timestamp=None): +def make_output( + data: Iterable[TMeasurement], + MKNs: Iterable[tuple[int, int, int]], + base_description: str, + timestamp=None, +): print(f"== All Results {base_description} ====") print_timers(data) @@ -258,8 +384,7 @@ def make_output(data: Iterable[TMeasurement], def run_square_bench(args): - dim_sizes = list( - range(args.dim_start, args.dim_end + 1, args.dim_increment)) + dim_sizes = list(range(args.dim_start, args.dim_end + 1, args.dim_increment)) MKNs = list(zip(dim_sizes, dim_sizes, dim_sizes)) data = run(args.dtype, MKNs) @@ -319,7 +444,7 @@ def model_shapes(model_name: str, tp_size: int) -> list[tuple[int, int]]: pkl.dump(all_data, f) -if __name__ == '__main__': +if __name__ == "__main__": def to_torch_dtype(dt): if dt == "int8": @@ -344,12 +469,15 @@ def to_torch_dtype(dt): Output: - a .pkl file, that is a list of raw torch.benchmark.utils.Measurements for the pytorch and cutlass implementations for the various GEMMs. 
""", # noqa: E501 - formatter_class=argparse.RawTextHelpFormatter) - - parser.add_argument("--dtype", - type=to_torch_dtype, - required=True, - help="Available options are ['int8', 'fp8']") + formatter_class=argparse.RawTextHelpFormatter, + ) + + parser.add_argument( + "--dtype", + type=to_torch_dtype, + required=True, + help="Available options are ['int8', 'fp8']", + ) subparsers = parser.add_subparsers(dest="cmd") square_parser = subparsers.add_parser("square_bench") @@ -368,19 +496,19 @@ def to_torch_dtype(dt): range_parser.set_defaults(func=run_range_bench) model_parser = subparsers.add_parser("model_bench") - model_parser.add_argument("--models", - nargs="+", - type=str, - default=DEFAULT_MODELS, - choices=WEIGHT_SHAPES.keys()) - model_parser.add_argument("--tp-sizes", - nargs="+", - type=int, - default=DEFAULT_TP_SIZES) - model_parser.add_argument("--batch-sizes", - nargs="+", - type=int, - default=DEFAULT_BATCH_SIZES) + model_parser.add_argument( + "--models", + nargs="+", + type=str, + default=DEFAULT_MODELS, + choices=WEIGHT_SHAPES.keys(), + ) + model_parser.add_argument( + "--tp-sizes", nargs="+", type=int, default=DEFAULT_TP_SIZES + ) + model_parser.add_argument( + "--batch-sizes", nargs="+", type=int, default=DEFAULT_BATCH_SIZES + ) model_parser.set_defaults(func=run_model_bench) args = parser.parse_args() diff --git a/benchmarks/cutlass_benchmarks/utils.py b/benchmarks/cutlass_benchmarks/utils.py index fe4d8fdfc066..7e9f5a7fc0f4 100644 --- a/benchmarks/cutlass_benchmarks/utils.py +++ b/benchmarks/cutlass_benchmarks/utils.py @@ -10,8 +10,9 @@ def to_fp8(tensor: torch.Tensor) -> torch.Tensor: finfo = torch.finfo(torch.float8_e4m3fn) - return torch.round(tensor.clamp( - min=finfo.min, max=finfo.max)).to(dtype=torch.float8_e4m3fn) + return torch.round(tensor.clamp(min=finfo.min, max=finfo.max)).to( + dtype=torch.float8_e4m3fn + ) def to_int8(tensor: torch.Tensor) -> torch.Tensor: @@ -26,10 +27,11 @@ def to_fp16(tensor: torch.Tensor) -> torch.Tensor: return tensor.to(dtype=torch.float16) -def make_rand_tensors(dtype: torch.dtype, m: int, n: int, - k: int) -> tuple[torch.Tensor, torch.Tensor]: - a = torch.randn((m, k), device='cuda') * 5 - b = torch.randn((n, k), device='cuda').t() * 5 +def make_rand_tensors( + dtype: torch.dtype, m: int, n: int, k: int +) -> tuple[torch.Tensor, torch.Tensor]: + a = torch.randn((m, k), device="cuda") * 5 + b = torch.randn((n, k), device="cuda").t() * 5 if dtype == torch.int8: return to_int8(a), to_int8(b) @@ -49,9 +51,7 @@ def prune_to_2_4(tensor): # Create binary mask mask = torch.zeros_like(reshaped) - mask.scatter_(dim=1, - index=indices, - src=torch.ones_like(indices, dtype=mask.dtype)) + mask.scatter_(dim=1, index=indices, src=torch.ones_like(indices, dtype=mask.dtype)) # Apply mask and reshape back pruned = reshaped * mask @@ -62,10 +62,11 @@ def prune_to_2_4(tensor): return pruned.reshape(original_shape) -def make_rand_sparse_tensors(dtype: torch.dtype, m: int, n: int, - k: int) -> tuple[torch.Tensor, torch.Tensor]: - a = torch.randn((m, k), device='cuda') * 5 - b = torch.randn((n, k), device='cuda').t() * 5 +def make_rand_sparse_tensors( + dtype: torch.dtype, m: int, n: int, k: int +) -> tuple[torch.Tensor, torch.Tensor]: + a = torch.randn((m, k), device="cuda") * 5 + b = torch.randn((n, k), device="cuda").t() * 5 b = prune_to_2_4(b.t()).t() @@ -86,9 +87,9 @@ def make_rand_sparse_tensors(dtype: torch.dtype, m: int, n: int, return b_compressed, e, a, b -def make_n_rand_sparse_tensors(num_tensors: int, dtype: torch.dtype, - m: int, n: int, k: 
int) -> \ - tuple[Iterable[torch.Tensor], Iterable[torch.Tensor]]: +def make_n_rand_sparse_tensors( + num_tensors: int, dtype: torch.dtype, m: int, n: int, k: int +) -> tuple[Iterable[torch.Tensor], Iterable[torch.Tensor]]: ABs = [] for _ in range(num_tensors): b_comp, e, a, b = make_rand_sparse_tensors(dtype, m, n, k) diff --git a/benchmarks/cutlass_benchmarks/w8a8_benchmarks.py b/benchmarks/cutlass_benchmarks/w8a8_benchmarks.py index e7b742d8bec9..504c5f5812e3 100644 --- a/benchmarks/cutlass_benchmarks/w8a8_benchmarks.py +++ b/benchmarks/cutlass_benchmarks/w8a8_benchmarks.py @@ -16,7 +16,8 @@ from vllm import _custom_ops as ops from vllm.model_executor.layers.quantization.utils.fp8_utils import ( - w8a8_block_fp8_matmul) + w8a8_block_fp8_matmul, +) from vllm.utils import FlexibleArgumentParser DEFAULT_MODELS = list(WEIGHT_SHAPES.keys()) @@ -25,8 +26,9 @@ # bench -def bench_fn(label: str, sub_label: str, description: str, fn: Callable, *args, - **kwargs) -> TMeasurement: +def bench_fn( + label: str, sub_label: str, description: str, fn: Callable, *args, **kwargs +) -> TMeasurement: min_run_time = 1 globals = { @@ -44,45 +46,48 @@ def bench_fn(label: str, sub_label: str, description: str, fn: Callable, *args, def bench_int8( - dtype: torch.dtype, - m: int, - k: int, - n: int, - label: str, - sub_label: str, - bench_kernels: Optional[list[str]] = None) -> Iterable[TMeasurement]: + dtype: torch.dtype, + m: int, + k: int, + n: int, + label: str, + sub_label: str, + bench_kernels: Optional[list[str]] = None, +) -> Iterable[TMeasurement]: """Benchmark INT8-based kernels.""" assert dtype == torch.int8 a, b = make_rand_tensors(torch.int8, m, n, k) scale_a = torch.tensor(1.0, device="cuda", dtype=torch.float32) scale_b = torch.tensor(1.0, device="cuda", dtype=torch.float32) - bias = torch.zeros((n, ), device="cuda", dtype=torch.bfloat16) - azp = torch.zeros((m, ), device="cuda", dtype=torch.int32) - azp_adj = torch.zeros((n, ), device="cuda", dtype=torch.int32) + bias = torch.zeros((n,), device="cuda", dtype=torch.bfloat16) + azp = torch.zeros((m,), device="cuda", dtype=torch.int32) + azp_adj = torch.zeros((n,), device="cuda", dtype=torch.int32) bench_fns = { - "pytorch_bf16_bf16_bf16_matmul-no-scales": - lambda: torch.mm(a.to(dtype=torch.bfloat16), b.to(dtype=torch.bfloat16) - ), - "pytorch_fp16_fp16_fp16_matmul-no-scales": - lambda: torch.mm(a.to(dtype=torch.float16), b.to(dtype=torch.float16)), - "cutlass_i8_i8_bf16_scaled_mm": - lambda: ops.cutlass_scaled_mm(a, b, scale_a, scale_b, torch.bfloat16), - "cutlass_i8_i8_bf16_scaled_mm_bias": - lambda: ops.cutlass_scaled_mm(a, b, scale_a, scale_b, torch.bfloat16, - bias), - "cutlass_i8_i8_bf16_scaled_mm_azp": - lambda: ops.cutlass_scaled_mm_azp(a, b, scale_a, scale_b, torch. - bfloat16, azp_adj), - "cutlass_i8_i8_bf16_scaled_mm_azp_bias": - lambda: ops.cutlass_scaled_mm_azp(a, b, scale_a, scale_b, torch. - bfloat16, azp_adj, None, bias), - "cutlass_i8_i8_bf16_scaled_mm_azp_pt": - lambda: ops.cutlass_scaled_mm_azp(a, b, scale_a, scale_b, torch. - bfloat16, azp_adj, azp), - "cutlass_i8_i8_bf16_scaled_mm_azp_pt_bias": - lambda: ops.cutlass_scaled_mm_azp(a, b, scale_a, scale_b, torch. 
- bfloat16, azp_adj, azp, bias), + "pytorch_bf16_bf16_bf16_matmul-no-scales": lambda: torch.mm( + a.to(dtype=torch.bfloat16), b.to(dtype=torch.bfloat16) + ), + "pytorch_fp16_fp16_fp16_matmul-no-scales": lambda: torch.mm( + a.to(dtype=torch.float16), b.to(dtype=torch.float16) + ), + "cutlass_i8_i8_bf16_scaled_mm": lambda: ops.cutlass_scaled_mm( + a, b, scale_a, scale_b, torch.bfloat16 + ), + "cutlass_i8_i8_bf16_scaled_mm_bias": lambda: ops.cutlass_scaled_mm( + a, b, scale_a, scale_b, torch.bfloat16, bias + ), + "cutlass_i8_i8_bf16_scaled_mm_azp": lambda: ops.cutlass_scaled_mm_azp( + a, b, scale_a, scale_b, torch.bfloat16, azp_adj + ), + "cutlass_i8_i8_bf16_scaled_mm_azp_bias": lambda: ops.cutlass_scaled_mm_azp( + a, b, scale_a, scale_b, torch.bfloat16, azp_adj, None, bias + ), + "cutlass_i8_i8_bf16_scaled_mm_azp_pt": lambda: ops.cutlass_scaled_mm_azp( + a, b, scale_a, scale_b, torch.bfloat16, azp_adj, azp + ), + "cutlass_i8_i8_bf16_scaled_mm_azp_pt_bias": lambda: ops.cutlass_scaled_mm_azp( + a, b, scale_a, scale_b, torch.bfloat16, azp_adj, azp, bias + ), } timers = [] @@ -96,73 +101,65 @@ def bench_int8( def bench_fp8( - dtype: torch.dtype, - m: int, - k: int, - n: int, - label: str, - sub_label: str, - bench_kernels: Optional[list[str]] = None) -> Iterable[TMeasurement]: + dtype: torch.dtype, + m: int, + k: int, + n: int, + label: str, + sub_label: str, + bench_kernels: Optional[list[str]] = None, +) -> Iterable[TMeasurement]: """Benchmark FP8-based kernels.""" assert dtype == torch.float8_e4m3fn a, b = make_rand_tensors(torch.float8_e4m3fn, m, n, k) a_cont = a.contiguous() scale_a = torch.tensor(1.0, device="cuda", dtype=torch.float32) scale_b = torch.tensor(1.0, device="cuda", dtype=torch.float32) - block_scale_a = torch.rand((m, k // 128), - device="cuda", - dtype=torch.float32) - block_scale_b = torch.rand((k // 128, n // 128), - device="cuda", - dtype=torch.float32) + block_scale_a = torch.rand((m, k // 128), device="cuda", dtype=torch.float32) + block_scale_b = torch.rand((k // 128, n // 128), device="cuda", dtype=torch.float32) block_scale_a_M_major = block_scale_a.t().contiguous().t() block_scale_b_K_major = block_scale_b.t().contiguous().t() - bias = torch.zeros((n, ), device="cuda", dtype=torch.bfloat16) + bias = torch.zeros((n,), device="cuda", dtype=torch.bfloat16) print(m, k, n) bench_fns = { - "pytorch_bf16_bf16_bf16_matmul-no-scales": - lambda: torch.mm(a.to(dtype=torch.bfloat16), b.to(dtype=torch.bfloat16) - ), - "pytorch_fp16_fp16_fp16_matmul-no-scales": - lambda: torch.mm(a.to(dtype=torch.float16), b.to(dtype=torch.float16)), - "pytorch_fp8_fp8_fp16_scaled_mm": - lambda: torch._scaled_mm( - a, b, scale_a, scale_b, out_dtype=torch.float16), - "pytorch_fp8_fp8_fp16_scaled_mm_fast_accum": - lambda: torch._scaled_mm(a, - b, - scale_a, - scale_b, - out_dtype=torch.float16, - use_fast_accum=True), - "pytorch_fp8_fp8_bf16_scaled_mm": - lambda: torch._scaled_mm( - a, b, scale_a, scale_b, out_dtype=torch.bfloat16), - "pytorch_fp8_fp8_bf16_scaled_mm_fast_accum": - lambda: torch._scaled_mm(a, - b, - scale_a, - scale_b, - out_dtype=torch.bfloat16, - use_fast_accum=True), - "cutlass_fp8_fp8_bf16_scaled_mm": - lambda: ops.cutlass_scaled_mm(a, b, scale_a, scale_b, torch.bfloat16), - "cutlass_fp8_fp8_fp16_scaled_mm": - lambda: ops.cutlass_scaled_mm(a, b, scale_a, scale_b, torch.float16), - "cutlass_fp8_fp8_bf16_scaled_mm_bias": - lambda: ops.cutlass_scaled_mm(a, b, scale_a, scale_b, torch.bfloat16, - bias), - "cutlass_fp8_fp8_fp16_scaled_mm_bias": - lambda: ops.cutlass_scaled_mm(a, b, 
scale_a, scale_b, torch.float16, - bias.to(dtype=torch.float16)), - "triton_fp8_fp8_fp16_scaled_mm_blockwise": - lambda: w8a8_block_fp8_matmul(a_cont, b.t(), block_scale_a, - block_scale_b.t(), (128, 128)), - "cutlass_fp8_fp8_fp16_scaled_mm_blockwise": - lambda: ops.cutlass_scaled_mm(a, b, block_scale_a_M_major, - block_scale_b_K_major, torch.float16), + "pytorch_bf16_bf16_bf16_matmul-no-scales": lambda: torch.mm( + a.to(dtype=torch.bfloat16), b.to(dtype=torch.bfloat16) + ), + "pytorch_fp16_fp16_fp16_matmul-no-scales": lambda: torch.mm( + a.to(dtype=torch.float16), b.to(dtype=torch.float16) + ), + "pytorch_fp8_fp8_fp16_scaled_mm": lambda: torch._scaled_mm( + a, b, scale_a, scale_b, out_dtype=torch.float16 + ), + "pytorch_fp8_fp8_fp16_scaled_mm_fast_accum": lambda: torch._scaled_mm( + a, b, scale_a, scale_b, out_dtype=torch.float16, use_fast_accum=True + ), + "pytorch_fp8_fp8_bf16_scaled_mm": lambda: torch._scaled_mm( + a, b, scale_a, scale_b, out_dtype=torch.bfloat16 + ), + "pytorch_fp8_fp8_bf16_scaled_mm_fast_accum": lambda: torch._scaled_mm( + a, b, scale_a, scale_b, out_dtype=torch.bfloat16, use_fast_accum=True + ), + "cutlass_fp8_fp8_bf16_scaled_mm": lambda: ops.cutlass_scaled_mm( + a, b, scale_a, scale_b, torch.bfloat16 + ), + "cutlass_fp8_fp8_fp16_scaled_mm": lambda: ops.cutlass_scaled_mm( + a, b, scale_a, scale_b, torch.float16 + ), + "cutlass_fp8_fp8_bf16_scaled_mm_bias": lambda: ops.cutlass_scaled_mm( + a, b, scale_a, scale_b, torch.bfloat16, bias + ), + "cutlass_fp8_fp8_fp16_scaled_mm_bias": lambda: ops.cutlass_scaled_mm( + a, b, scale_a, scale_b, torch.float16, bias.to(dtype=torch.float16) + ), + "triton_fp8_fp8_fp16_scaled_mm_blockwise": lambda: w8a8_block_fp8_matmul( + a_cont, b.t(), block_scale_a, block_scale_b.t(), (128, 128) + ), + "cutlass_fp8_fp8_fp16_scaled_mm_blockwise": lambda: ops.cutlass_scaled_mm( + a, b, block_scale_a_M_major, block_scale_b_K_major, torch.float16 + ), } timers = [] @@ -175,13 +172,15 @@ def bench_fp8( return timers -def bench(dtype: torch.dtype, - m: int, - k: int, - n: int, - label: str, - sub_label: str, - bench_kernels: Optional[list[str]] = None) -> Iterable[TMeasurement]: +def bench( + dtype: torch.dtype, + m: int, + k: int, + n: int, + label: str, + sub_label: str, + bench_kernels: Optional[list[str]] = None, +) -> Iterable[TMeasurement]: if dtype == torch.int8: return bench_int8(dtype, m, k, n, label, sub_label, bench_kernels) if dtype == torch.float8_e4m3fn: @@ -195,27 +194,33 @@ def print_timers(timers: Iterable[TMeasurement]): compare.print() -def run(dtype: torch.dtype, - MKNs: Iterable[tuple[int, int, int]], - bench_kernels: Optional[list[str]] = None) -> Iterable[TMeasurement]: +def run( + dtype: torch.dtype, + MKNs: Iterable[tuple[int, int, int]], + bench_kernels: Optional[list[str]] = None, +) -> Iterable[TMeasurement]: results = [] for m, k, n in MKNs: - timers = bench(dtype, - m, - k, - n, - f"scaled-{dtype}-gemm", - f"MKN=({m}x{k}x{n})", - bench_kernels=bench_kernels) + timers = bench( + dtype, + m, + k, + n, + f"scaled-{dtype}-gemm", + f"MKN=({m}x{k}x{n})", + bench_kernels=bench_kernels, + ) print_timers(timers) results.extend(timers) return results -def make_output(data: Iterable[TMeasurement], - MKNs: Iterable[tuple[int, int, int]], - base_description: str, - timestamp=None): +def make_output( + data: Iterable[TMeasurement], + MKNs: Iterable[tuple[int, int, int]], + base_description: str, + timestamp=None, +): print(f"== All Results {base_description} ====") print_timers(data) @@ -226,8 +231,7 @@ def make_output(data: 
Iterable[TMeasurement], def run_square_bench(args): - dim_sizes = list( - range(args.dim_start, args.dim_end + 1, args.dim_increment)) + dim_sizes = list(range(args.dim_start, args.dim_end + 1, args.dim_increment)) MKNs = list(zip(dim_sizes, dim_sizes, dim_sizes)) data = run(args.dtype, MKNs, bench_kernels=args.kernels) make_output(data, MKNs, f"square_bench-{args.dtype}") @@ -285,7 +289,7 @@ def model_shapes(model_name: str, tp_size: int) -> list[tuple[int, int]]: pkl.dump(all_data, f) -if __name__ == '__main__': +if __name__ == "__main__": def to_torch_dtype(dt): if dt == "int8": @@ -310,19 +314,21 @@ def to_torch_dtype(dt): Output: - a .pkl file, that is a list of raw torch.benchmark.utils.Measurements for the pytorch and cutlass implementations for the various GEMMs. """, # noqa: E501 - formatter_class=argparse.RawTextHelpFormatter) + formatter_class=argparse.RawTextHelpFormatter, + ) - parser.add_argument("--dtype", - type=to_torch_dtype, - required=True, - help="Available options are ['int8', 'fp8']") + parser.add_argument( + "--dtype", + type=to_torch_dtype, + required=True, + help="Available options are ['int8', 'fp8']", + ) parser.add_argument( "--kernels", nargs="+", type=str, default=None, - help= - "Exact names of the kernels to benchmark. If not set, runs all kernels." + help="Exact names of the kernels to benchmark. If not set, runs all kernels.", ) subparsers = parser.add_subparsers(dest="cmd") @@ -343,19 +349,19 @@ def to_torch_dtype(dt): range_parser.set_defaults(func=run_range_bench) model_parser = subparsers.add_parser("model_bench") - model_parser.add_argument("--models", - nargs="+", - type=str, - default=DEFAULT_MODELS, - choices=WEIGHT_SHAPES.keys()) - model_parser.add_argument("--tp-sizes", - nargs="+", - type=int, - default=DEFAULT_TP_SIZES) - model_parser.add_argument("--batch-sizes", - nargs="+", - type=int, - default=DEFAULT_BATCH_SIZES) + model_parser.add_argument( + "--models", + nargs="+", + type=str, + default=DEFAULT_MODELS, + choices=WEIGHT_SHAPES.keys(), + ) + model_parser.add_argument( + "--tp-sizes", nargs="+", type=int, default=DEFAULT_TP_SIZES + ) + model_parser.add_argument( + "--batch-sizes", nargs="+", type=int, default=DEFAULT_BATCH_SIZES + ) model_parser.set_defaults(func=run_model_bench) args = parser.parse_args() diff --git a/benchmarks/cutlass_benchmarks/weight_shapes.py b/benchmarks/cutlass_benchmarks/weight_shapes.py index 3d1121df40d0..d31b623a1ee6 100644 --- a/benchmarks/cutlass_benchmarks/weight_shapes.py +++ b/benchmarks/cutlass_benchmarks/weight_shapes.py @@ -42,4 +42,4 @@ ([8192, 57344], 1), ([28672, 8192], 0), ], -} \ No newline at end of file +} diff --git a/benchmarks/disagg_benchmarks/disagg_prefill_proxy_server.py b/benchmarks/disagg_benchmarks/disagg_prefill_proxy_server.py index 980e68668911..fce156e1c96c 100644 --- a/benchmarks/disagg_benchmarks/disagg_prefill_proxy_server.py +++ b/benchmarks/disagg_benchmarks/disagg_prefill_proxy_server.py @@ -12,39 +12,37 @@ async def forward_request(url, data): async with aiohttp.ClientSession(timeout=AIOHTTP_TIMEOUT) as session: - headers = { - "Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}" - } - async with session.post(url=url, json=data, - headers=headers) as response: + headers = {"Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}"} + async with session.post(url=url, json=data, headers=headers) as response: if response.status == 200: # if response.headers.get('Transfer-Encoding') == 'chunked': if True: - async for chunk_bytes in response.content.iter_chunked( - 
1024): + async for chunk_bytes in response.content.iter_chunked(1024): yield chunk_bytes else: content = await response.read() yield content -@app.route('/v1/completions', methods=['POST']) +@app.route("/v1/completions", methods=["POST"]) async def handle_request(): try: original_request_data = await request.get_json() prefill_request = original_request_data.copy() # change max_tokens = 1 to let it only do prefill - prefill_request['max_tokens'] = 1 + prefill_request["max_tokens"] = 1 # finish prefill - async for _ in forward_request('http://localhost:8100/v1/completions', - prefill_request): + async for _ in forward_request( + "http://localhost:8100/v1/completions", prefill_request + ): continue # return decode - generator = forward_request('http://localhost:8200/v1/completions', - original_request_data) + generator = forward_request( + "http://localhost:8200/v1/completions", original_request_data + ) response = await make_response(generator) response.timeout = None @@ -53,11 +51,12 @@ async def handle_request(): except Exception as e: import sys import traceback + exc_info = sys.exc_info() print("Error occurred in disagg prefill proxy server") print(e) print("".join(traceback.format_exception(*exc_info))) -if __name__ == '__main__': +if __name__ == "__main__": app.run(port=8000) diff --git a/benchmarks/disagg_benchmarks/round_robin_proxy.py b/benchmarks/disagg_benchmarks/round_robin_proxy.py index c2ad4916bf07..fd19b40bf252 100644 --- a/benchmarks/disagg_benchmarks/round_robin_proxy.py +++ b/benchmarks/disagg_benchmarks/round_robin_proxy.py @@ -8,7 +8,6 @@ class RoundRobinProxy: - def __init__(self, target_ports): self.target_ports = target_ports self.port_cycle = itertools.cycle(self.target_ports) @@ -21,14 +20,15 @@ async def handle_request(self, request): try: # Forward the request async with session.request( - method=request.method, - url=target_url, - headers=request.headers, - data=request.content, + method=request.method, + url=target_url, + headers=request.headers, + data=request.content, ) as response: # Start sending the response - resp = web.StreamResponse(status=response.status, - headers=response.headers) + resp = web.StreamResponse( + status=response.status, headers=response.headers + ) await resp.prepare(request) # Stream the response content @@ -45,11 +45,11 @@ async def handle_request(self, request): async def main(): proxy = RoundRobinProxy([8100, 8200]) app = web.Application() - app.router.add_route('*', '/{path:.*}', proxy.handle_request) + app.router.add_route("*", "/{path:.*}", proxy.handle_request) runner = web.AppRunner(app) await runner.setup() - site = web.TCPSite(runner, 'localhost', 8000) + site = web.TCPSite(runner, "localhost", 8000) await site.start() print("Proxy server started on http://localhost:8000") @@ -58,5 +58,5 @@ async def main(): await asyncio.Event().wait() -if __name__ == '__main__': +if __name__ == "__main__": asyncio.run(main()) diff --git a/benchmarks/disagg_benchmarks/visualize_benchmark_results.py b/benchmarks/disagg_benchmarks/visualize_benchmark_results.py index a7b4b9e8bf30..484d0cb3cba7 100644 --- a/benchmarks/disagg_benchmarks/visualize_benchmark_results.py +++ b/benchmarks/disagg_benchmarks/visualize_benchmark_results.py @@ -6,43 +6,41 @@ import pandas as pd if __name__ == "__main__": - data = [] - for name in ['disagg_prefill', 'chunked_prefill']: + for name in ["disagg_prefill", "chunked_prefill"]: for qps in [2, 4, 6, 8]: with open(f"results/{name}-qps-{qps}.json") as f: x = json.load(f) - x['name'] = name - x['qps'] = qps + 
x["name"] = name + x["qps"] = qps data.append(x) df = pd.DataFrame.from_dict(data) - dis_df = df[df['name'] == 'disagg_prefill'] - chu_df = df[df['name'] == 'chunked_prefill'] + dis_df = df[df["name"] == "disagg_prefill"] + chu_df = df[df["name"] == "chunked_prefill"] - plt.style.use('bmh') - plt.rcParams['font.size'] = 20 + plt.style.use("bmh") + plt.rcParams["font.size"] = 20 for key in [ - 'mean_ttft_ms', 'median_ttft_ms', 'p99_ttft_ms', 'mean_itl_ms', - 'median_itl_ms', 'p99_itl_ms' + "mean_ttft_ms", + "median_ttft_ms", + "p99_ttft_ms", + "mean_itl_ms", + "median_itl_ms", + "p99_itl_ms", ]: - fig, ax = plt.subplots(figsize=(11, 7)) - plt.plot(dis_df['qps'], - dis_df[key], - label='disagg_prefill', - marker='o', - linewidth=4) - plt.plot(chu_df['qps'], - chu_df[key], - label='chunked_prefill', - marker='o', - linewidth=4) + plt.plot( + dis_df["qps"], dis_df[key], label="disagg_prefill", marker="o", linewidth=4 + ) + plt.plot( + chu_df["qps"], chu_df[key], label="chunked_prefill", marker="o", linewidth=4 + ) ax.legend() - ax.set_xlabel('QPS') + ax.set_xlabel("QPS") ax.set_ylabel(key) ax.set_ylim(bottom=0) - fig.savefig(f'results/{key}.png') + fig.savefig(f"results/{key}.png") plt.close(fig) diff --git a/benchmarks/fused_kernels/layernorm_rms_benchmarks.py b/benchmarks/fused_kernels/layernorm_rms_benchmarks.py index 3da583a33448..37a9173a1a93 100644 --- a/benchmarks/fused_kernels/layernorm_rms_benchmarks.py +++ b/benchmarks/fused_kernels/layernorm_rms_benchmarks.py @@ -24,10 +24,12 @@ class bench_params_t: dtype: torch.dtype def description(self): - return (f'N {self.num_tokens} ' - f'x D {self.hidden_size} ' - f'x R {self.add_residual} ' - f'x DT {self.dtype}') + return ( + f"N {self.num_tokens} " + f"x D {self.hidden_size} " + f"x R {self.add_residual} " + f"x DT {self.dtype}" + ) def get_bench_params() -> list[bench_params_t]: @@ -38,15 +40,19 @@ def get_bench_params() -> list[bench_params_t]: DTYPES = [torch.bfloat16, torch.float] combinations = product(NUM_TOKENS, HIDDEN_SIZES, ADD_RESIDUAL, DTYPES) - bench_params = list(map(lambda x: \ - bench_params_t(x[0], x[1], x[2], x[3]), combinations)) + bench_params = list( + map(lambda x: bench_params_t(x[0], x[1], x[2], x[3]), combinations) + ) return bench_params # Reference impls -def unfused_int8_impl(rms_norm_layer: RMSNorm, x: torch.Tensor, - residual: Optional[torch.Tensor], - quant_dtype: torch.dtype): +def unfused_int8_impl( + rms_norm_layer: RMSNorm, + x: torch.Tensor, + residual: Optional[torch.Tensor], + quant_dtype: torch.dtype, +): # Norm torch_out = None if residual is None: @@ -58,9 +64,12 @@ def unfused_int8_impl(rms_norm_layer: RMSNorm, x: torch.Tensor, torch_out, _, _ = ops.scaled_int8_quant(torch_out) -def unfused_fp8_impl(rms_norm_layer: RMSNorm, x: torch.Tensor, - residual: Optional[torch.Tensor], - quant_dtype: torch.dtype): +def unfused_fp8_impl( + rms_norm_layer: RMSNorm, + x: torch.Tensor, + residual: Optional[torch.Tensor], + quant_dtype: torch.dtype, +): # Norm torch_out = None if residual is None: @@ -73,22 +82,27 @@ def unfused_fp8_impl(rms_norm_layer: RMSNorm, x: torch.Tensor, def fused_impl( - rms_norm_layer: RMSNorm, # this stores the weights - x: torch.Tensor, - residual: Optional[torch.Tensor], - quant_dtype: torch.dtype): - out, _ = ops.rms_norm_dynamic_per_token_quant(x, - rms_norm_layer.weight, - 1e-6, - quant_dtype, - residual=residual) + rms_norm_layer: RMSNorm, # this stores the weights + x: torch.Tensor, + residual: Optional[torch.Tensor], + quant_dtype: torch.dtype, +): + out, _ = 
ops.rms_norm_dynamic_per_token_quant( + x, rms_norm_layer.weight, 1e-6, quant_dtype, residual=residual + ) # Bench functions -def bench_fn(rms_norm_layer: RMSNorm, x: torch.Tensor, residual: torch.Tensor, - quant_dtype: torch.dtype, label: str, sub_label: str, - fn: Callable, description: str) -> TMeasurement: - +def bench_fn( + rms_norm_layer: RMSNorm, + x: torch.Tensor, + residual: torch.Tensor, + quant_dtype: torch.dtype, + label: str, + sub_label: str, + fn: Callable, + description: str, +) -> TMeasurement: min_run_time = 1 globals = { @@ -106,43 +120,81 @@ def bench_fn(rms_norm_layer: RMSNorm, x: torch.Tensor, residual: torch.Tensor, description=description, ).blocked_autorange(min_run_time=min_run_time) -def bench(params: bench_params_t, label: str, sub_label: str) \ - -> Iterable[TMeasurement]: +def bench(params: bench_params_t, label: str, sub_label: str) -> Iterable[TMeasurement]: # Make inputs layer = RMSNorm(params.hidden_size, 1e-6).to(dtype=params.dtype) # Make weights layer.weight.data.normal_(mean=1.0, std=0.1) # Make inputs scale = 1 / params.hidden_size - x = torch.randn(params.num_tokens, - params.hidden_size, - dtype=params.dtype, - device='cuda') * scale - residual = (torch.randn_like(x) * scale).to(device='cuda') \ - if params.add_residual else None + x = ( + torch.randn( + params.num_tokens, params.hidden_size, dtype=params.dtype, device="cuda" + ) + * scale + ) + residual = ( + (torch.randn_like(x) * scale).to(device="cuda") if params.add_residual else None + ) timers = [] # unfused int8 impl. timers.append( - bench_fn(layer, x, residual, torch.int8, label, sub_label, - unfused_int8_impl, "unfused_int8_impl")) + bench_fn( + layer, + x, + residual, + torch.int8, + label, + sub_label, + unfused_int8_impl, + "unfused_int8_impl", + ) + ) # unfused fp8 impl. timers.append( - bench_fn(layer, x, residual, torch.float8_e4m3fn, label, sub_label, - unfused_fp8_impl, "unfused_fp8_impl")) + bench_fn( + layer, + x, + residual, + torch.float8_e4m3fn, + label, + sub_label, + unfused_fp8_impl, + "unfused_fp8_impl", + ) + ) # fused int8 impl. timers.append( - bench_fn(layer, x, residual, torch.int8, label, sub_label, fused_impl, - "fused_int8_impl")) + bench_fn( + layer, + x, + residual, + torch.int8, + label, + sub_label, + fused_impl, + "fused_int8_impl", + ) + ) # fused fp8 impl. 
timers.append( - bench_fn(layer, x, residual, torch.float8_e4m3fn, label, sub_label, - fused_impl, "fused_fp8_impl")) + bench_fn( + layer, + x, + residual, + torch.float8_e4m3fn, + label, + sub_label, + fused_impl, + "fused_fp8_impl", + ) + ) print_timers(timers) @@ -157,13 +209,12 @@ def print_timers(timers: Iterable[TMeasurement]): def main(): - torch.set_default_device('cuda') + torch.set_default_device("cuda") bench_params = get_bench_params() timers = [] for bp in tqdm(bench_params): - timers.extend( - bench(bp, "rms-norm-dynamic-per-token-quant", bp.description())) + timers.extend(bench(bp, "rms-norm-dynamic-per-token-quant", bp.description())) print_timers(timers) # pickle all the results @@ -172,5 +223,5 @@ def main(): pkl.dump(timers, f) -if __name__ == '__main__': +if __name__ == "__main__": main() diff --git a/benchmarks/kernels/benchmark_aqlm.py b/benchmarks/kernels/benchmark_aqlm.py index 8d20b91560dd..e9934aa479dd 100644 --- a/benchmarks/kernels/benchmark_aqlm.py +++ b/benchmarks/kernels/benchmark_aqlm.py @@ -9,32 +9,39 @@ from vllm import _custom_ops as ops from vllm.model_executor.layers.quantization.aqlm import ( - dequantize_weight, generic_dequantize_gemm, get_int_dtype, - optimized_dequantize_gemm) + dequantize_weight, + generic_dequantize_gemm, + get_int_dtype, + optimized_dequantize_gemm, +) from vllm.utils import FlexibleArgumentParser -os.environ['CUDA_VISIBLE_DEVICES'] = '0' +os.environ["CUDA_VISIBLE_DEVICES"] = "0" def torch_mult( - input: torch.Tensor, # [..., in_features] - weights: torch.Tensor, - scales: torch.Tensor, # [num_out_groups, 1, 1, 1] + # [..., in_features] + input: torch.Tensor, + weights: torch.Tensor, + # [num_out_groups, 1, 1, 1] + scales: torch.Tensor, ) -> torch.Tensor: output = F.linear(input, weights) return output def dequant_out_scale( - input: torch.Tensor, # [..., in_features] - codes: torch.IntTensor, # [num_out_groups, num_in_groups, num_codebooks] - codebooks: torch. - Tensor, # [num_codebooks, codebook_size, out_group_size, in_group_size] - scales: torch.Tensor, # [num_out_groups, 1, 1, 1] + # [..., in_features] + input: torch.Tensor, + # [num_out_groups, num_in_groups, num_codebooks] + codes: torch.IntTensor, + # [num_codebooks, codebook_size, out_group_size, in_group_size] + codebooks: torch.Tensor, + # [num_out_groups, 1, 1, 1] + scales: torch.Tensor, output_partition_sizes: torch.IntTensor, bias: Optional[torch.Tensor], ) -> torch.Tensor: - weights = ops.aqlm_dequant(codes, codebooks, output_partition_sizes) if bias is None: @@ -46,40 +53,42 @@ def dequant_out_scale( flattened_output *= b_scales return flattened_output.view(orig_shape) else: - b_scales = scales.view(scales.shape[:-3] + (-1, )).expand( - -1, weights.shape[1]) + b_scales = scales.view(scales.shape[:-3] + (-1,)).expand(-1, weights.shape[1]) weights *= b_scales return F.linear(input, weights, bias) def dequant_weight_scale( - input: torch.Tensor, # [..., in_features] - codes: torch.IntTensor, # [num_out_groups, num_in_groups, num_codebooks] - codebooks: torch. 
- Tensor, # [num_codebooks, codebook_size, out_group_size, in_group_size] - scales: torch.Tensor, # [num_out_groups, 1, 1, 1] + # [..., in_features] + input: torch.Tensor, + # [num_out_groups, num_in_groups, num_codebooks] + codes: torch.IntTensor, + # [num_codebooks, codebook_size, out_group_size, in_group_size] + codebooks: torch.Tensor, + # [num_out_groups, 1, 1, 1] + scales: torch.Tensor, output_partition_sizes: torch.IntTensor, bias: Optional[torch.Tensor], ) -> torch.Tensor: - weights = ops.aqlm_dequant(codes, codebooks, output_partition_sizes) - b_scales = scales.view(scales.shape[:-3] + (-1, )).expand( - -1, weights.shape[1]) + b_scales = scales.view(scales.shape[:-3] + (-1,)).expand(-1, weights.shape[1]) weights *= b_scales return F.linear(input, weights, bias) def dequant_no_scale( - input: torch.Tensor, # [..., in_features] - codes: torch.IntTensor, # [num_out_groups, num_in_groups, num_codebooks] - codebooks: torch. - Tensor, # [num_codebooks, codebook_size, out_group_size, in_group_size] - scales: torch.Tensor, # [num_out_groups, 1, 1, 1] + # [..., in_features] + input: torch.Tensor, + # [num_out_groups, num_in_groups, num_codebooks] + codes: torch.IntTensor, + # [num_codebooks, codebook_size, out_group_size, in_group_size] + codebooks: torch.Tensor, + # [num_out_groups, 1, 1, 1] + scales: torch.Tensor, output_partition_sizes: torch.IntTensor, bias: Optional[torch.Tensor], ) -> torch.Tensor: - weights = ops.aqlm_dequant(codes, codebooks, output_partition_sizes) return F.linear(input, weights, bias) @@ -89,23 +98,26 @@ def dequant_no_scale( # the generic pytorch version. # Just visual comparison. def dequant_test(k: int, parts: torch.Tensor, nbooks: int, bits: int) -> None: - n = int(parts.sum().item()) - device = torch.device('cuda:0') + device = torch.device("cuda:0") code_range = (1 << bits) // 2 ingroups = 8 - codes = torch.randint(-code_range, - code_range, - size=(n, k // ingroups, nbooks), - dtype=get_int_dtype(bits), - device=device) + codes = torch.randint( + -code_range, + code_range, + size=(n, k // ingroups, nbooks), + dtype=get_int_dtype(bits), + device=device, + ) - codebooks = torch.randn(size=(parts.shape[0] * nbooks, 1 << bits, 1, 8), - dtype=torch.float16, - device=device) + codebooks = torch.randn( + size=(parts.shape[0] * nbooks, 1 << bits, 1, 8), + dtype=torch.float16, + device=device, + ) count = 0 for index in range(16): @@ -138,24 +150,25 @@ def dequant_test(k: int, parts: torch.Tensor, nbooks: int, bits: int) -> None: def main(): - parser = FlexibleArgumentParser(description="Benchmark aqlm performance.") # Add arguments - parser.add_argument("--nbooks", - type=int, - default=1, - help="Number of codebooks (default: 1)") - parser.add_argument("--bits", - type=int, - default=16, - help="Number of bits per code element (default: 16)") + parser.add_argument( + "--nbooks", type=int, default=1, help="Number of codebooks (default: 1)" + ) + parser.add_argument( + "--bits", + type=int, + default=16, + help="Number of bits per code element (default: 16)", + ) parser.add_argument( "--test", type=bool, default=False, help="Run the decompression/dequant tester rather than benchmarking " - "(default: False)") + "(default: False)", + ) # Parse the arguments args = parser.parse_args() @@ -165,7 +178,7 @@ def main(): bits = args.bits if args.test: - dequant_test(4096, torch.tensor((4096, )), nbooks, bits) + dequant_test(4096, torch.tensor((4096,)), nbooks, bits) return # Otherwise, benchmark. 
@@ -184,31 +197,54 @@ def main(): with open(filename, "w") as f: sys.stdout = f - print('m | k | n | n parts', end='') + print("m | k | n | n parts", end="") for method in methods: - print(f" | {method.__name__.replace('_', ' ')} (µs)", end='') - print('') + print(f" | {method.__name__.replace('_', ' ')} (µs)", end="") + print("") # These are reasonable prefill sizes. - ksandpartions = ((4096, (4096, 4096, 4096)), (4096, (4096, )), - (4096, (11008, 11008)), (11008, (4096, ))) + ksandpartions = ( + (4096, (4096, 4096, 4096)), + (4096, (4096,)), + (4096, (11008, 11008)), + (11008, (4096,)), + ) # reasonable ranges for m. for m in [ - 1, 2, 4, 8, 10, 12, 14, 16, 24, 32, 48, 52, 56, 64, 96, 112, - 128, 256, 512, 1024, 1536, 2048, 3072, 4096 + 1, + 2, + 4, + 8, + 10, + 12, + 14, + 16, + 24, + 32, + 48, + 52, + 56, + 64, + 96, + 112, + 128, + 256, + 512, + 1024, + 1536, + 2048, + 3072, + 4096, ]: - print(f'{m}', file=sys.__stdout__) + print(f"{m}", file=sys.__stdout__) for ksp in ksandpartions: - run_grid(m, ksp[0], torch.tensor(ksp[1]), nbooks, bits, - methods) + run_grid(m, ksp[0], torch.tensor(ksp[1]), nbooks, bits, methods) sys.stdout = sys.__stdout__ -def run_grid(m: int, k: int, parts: torch.Tensor, nbooks: int, bits: int, - methods): - +def run_grid(m: int, k: int, parts: torch.Tensor, nbooks: int, bits: int, methods): # I didn't see visible improvements from increasing these, but feel free :) num_warmup_trials = 1 num_trials = 1 @@ -229,7 +265,7 @@ def run_grid(m: int, k: int, parts: torch.Tensor, nbooks: int, bits: int, ) n = parts.sum().item() - print(f'{m} | {k} | {n} | {parts.tolist()}', end='') + print(f"{m} | {k} | {n} | {parts.tolist()}", end="") for method in methods: best_time_us = 1e20 @@ -249,32 +285,36 @@ def run_grid(m: int, k: int, parts: torch.Tensor, nbooks: int, bits: int, if kernel_dur_us < best_time_us: best_time_us = kernel_dur_us - print(f' | {kernel_dur_us:.0f}', end='') + print(f" | {kernel_dur_us:.0f}", end="") - print('') + print("") -def run_timing(num_calls: int, m: int, k: int, parts: torch.Tensor, - nbooks: int, bits: int, method) -> float: - +def run_timing( + num_calls: int, m: int, k: int, parts: torch.Tensor, nbooks: int, bits: int, method +) -> float: n = int(parts.sum().item()) - device = torch.device('cuda:0') + device = torch.device("cuda:0") input = torch.randn((1, m, k), dtype=torch.float16, device=device) code_range = (1 << bits) // 2 ingroups = 8 - codes = torch.randint(-code_range, - code_range, - size=(n, k // ingroups, nbooks), - dtype=get_int_dtype(bits), - device=device) - - codebooks = torch.randn(size=(parts.shape[0] * nbooks, 1 << bits, 1, 8), - dtype=torch.float16, - device=device) + codes = torch.randint( + -code_range, + code_range, + size=(n, k // ingroups, nbooks), + dtype=get_int_dtype(bits), + device=device, + ) + + codebooks = torch.randn( + size=(parts.shape[0] * nbooks, 1 << bits, 1, 8), + dtype=torch.float16, + device=device, + ) scales = torch.randn(size=(n, 1, 1, 1), dtype=torch.float16, device=device) diff --git a/benchmarks/kernels/benchmark_bitblas.py b/benchmarks/kernels/benchmark_bitblas.py index b23b4f3ea685..d40ab70ec539 100644 --- a/benchmarks/kernels/benchmark_bitblas.py +++ b/benchmarks/kernels/benchmark_bitblas.py @@ -3,27 +3,33 @@ # Licensed under the MIT License. from vllm.model_executor.layers.quantization.utils.bitblas_utils import ( - MINIMUM_BITBLAS_VERSION) + MINIMUM_BITBLAS_VERSION, +) try: import bitblas + if bitblas.__version__ < MINIMUM_BITBLAS_VERSION: - raise ImportError("bitblas version is wrong. 
Please " - f"install bitblas>={MINIMUM_BITBLAS_VERSION}") + raise ImportError( + "bitblas version is wrong. Please " + f"install bitblas>={MINIMUM_BITBLAS_VERSION}" + ) except ImportError as e: bitblas_import_exception = e - raise ValueError("Trying to use the bitblas backend, but could not import" - f"with the following error: {bitblas_import_exception}. " - "Please install bitblas through the following command: " - f"`pip install bitblas>={MINIMUM_BITBLAS_VERSION}`" - ) from bitblas_import_exception + raise ValueError( + "Trying to use the bitblas backend, but could not import" + f"with the following error: {bitblas_import_exception}. " + "Please install bitblas through the following command: " + f"`pip install bitblas>={MINIMUM_BITBLAS_VERSION}`" + ) from bitblas_import_exception from bitblas import Matmul, MatmulConfig, auto_detect_nvidia_target from vllm.utils import FlexibleArgumentParser parser = FlexibleArgumentParser( - description="Benchmark BitBLAS int4 on a specific target.") + description="Benchmark BitBLAS int4 on a specific target." +) # Add arguments to the parser parser.add_argument( @@ -32,10 +38,9 @@ default=auto_detect_nvidia_target(), help="Specify the target device for benchmarking.", ) -parser.add_argument("--group_size", - type=int, - default=None, - help="Group size for grouped quantization.") +parser.add_argument( + "--group_size", type=int, default=None, help="Group size for grouped quantization." +) parser.add_argument( "--A_dtype", type=str, @@ -82,17 +87,17 @@ choices=["nt", "nn"], help="Matrix layout, 'nt' for non-transpose A and transpose W.", ) -parser.add_argument("--with_bias", - action="store_true", - help="Include bias in the benchmark.") +parser.add_argument( + "--with_bias", action="store_true", help="Include bias in the benchmark." +) parser.add_argument( "--with_scaling", action="store_true", help="Include scaling factor in the quantization.", ) -parser.add_argument("--with_zeros", - action="store_true", - help="Include zeros in the quantization.") +parser.add_argument( + "--with_zeros", action="store_true", help="Include zeros in the quantization." +) parser.add_argument( "--zeros_mode", type=str, @@ -170,8 +175,7 @@ ] # Build test shapes with all the shared arguments -test_shapes = [(MatmulConfig, Matmul, (*shape, *shared_args)) - for shape in shapes] +test_shapes = [(MatmulConfig, Matmul, (*shape, *shared_args)) for shape in shapes] benchmark_sets = [] benchmark_sets.extend(test_shapes) @@ -206,12 +210,12 @@ func_name = args_split[0] input_args_str = "-".join(args_split[1:]) col_widths[0] = max(col_widths[0], len(func_name) + 2, len(headers[0]) + 2) - col_widths[1] = max(col_widths[1], - len(input_args_str) + 2, - len(headers[1]) + 2) - col_widths[2] = max(col_widths[2], - len(f"{values['BitBLAS_top20_latency']:.3f} ms") + 2, - len(headers[2]) + 2) + col_widths[1] = max(col_widths[1], len(input_args_str) + 2, len(headers[1]) + 2) + col_widths[2] = max( + col_widths[2], + len(f"{values['BitBLAS_top20_latency']:.3f} ms") + 2, + len(headers[2]) + 2, + ) # break only if you want to measure widths from a single example; # otherwise, let it loop over all items. 
@@ -232,5 +236,6 @@ f"{values['BitBLAS_top20_latency']:.3f} ms", ] row_str = "".join( - [str(cell).ljust(col_widths[idx]) for idx, cell in enumerate(row)]) + [str(cell).ljust(col_widths[idx]) for idx, cell in enumerate(row)] + ) print(row_str) diff --git a/benchmarks/kernels/benchmark_cutlass_fp4_moe.py b/benchmarks/kernels/benchmark_cutlass_fp4_moe.py index 0d091b47c3e1..d39d8a6e3aba 100644 --- a/benchmarks/kernels/benchmark_cutlass_fp4_moe.py +++ b/benchmarks/kernels/benchmark_cutlass_fp4_moe.py @@ -5,6 +5,7 @@ activations. The triton_moe kernel takes in fp8 weights(tensor scaled to fp8) and 16-bit activations. """ + import nvtx import torch import torch.utils.benchmark as benchmark @@ -12,8 +13,7 @@ from vllm import _custom_ops as ops from vllm.config import ParallelConfig, VllmConfig, set_current_vllm_config from vllm.model_executor.layers.fused_moe.cutlass_moe import cutlass_moe_fp4 -from vllm.model_executor.layers.fused_moe.fused_moe import (fused_experts, - fused_topk) +from vllm.model_executor.layers.fused_moe.fused_moe import fused_experts, fused_topk from vllm.scalar_type import scalar_types from vllm.utils import FlexibleArgumentParser @@ -38,19 +38,27 @@ def to_fp8(tensor: torch.Tensor): finfo = torch.finfo(torch.float8_e4m3fn) - return torch.round(tensor.clamp( - min=finfo.min, max=finfo.max)).to(dtype=torch.float8_e4m3fn) + return torch.round(tensor.clamp(min=finfo.min, max=finfo.max)).to( + dtype=torch.float8_e4m3fn + ) -def bench_run(results: list[benchmark.Measurement], model: str, - num_experts: int, topk: int, per_act_token: bool, - per_out_ch: bool, mkn: tuple[int, int, int]): +def bench_run( + results: list[benchmark.Measurement], + model: str, + num_experts: int, + topk: int, + per_act_token: bool, + per_out_ch: bool, + mkn: tuple[int, int, int], +): label = "NVFP4 Blockscaled CUTLASS MOE vs FP8 Tensor Scaled Triton" sub_label = ( - "{}, num_experts={}, topk={}, per_act_token={} per_out_ch={}, " - "MKN=({})".format(model, num_experts, topk, per_act_token, per_out_ch, - mkn)) + "{}, num_experts={}, topk={}, per_act_token={} per_out_ch={}, MKN=({})".format( + model, num_experts, topk, per_act_token, per_out_ch, mkn + ) + ) print(f"Testing: {sub_label}") @@ -64,18 +72,12 @@ def bench_run(results: list[benchmark.Measurement], model: str, _, a_fp8_scale = ops.scaled_fp8_quant(a) - w1_fp8q = torch.empty((num_experts, 2 * n, k), - device=device, - dtype=torch.float8_e4m3fn) - w2_fp8q = torch.empty((num_experts, k, n), - device=device, - dtype=torch.float8_e4m3fn) - w1_fp8scale = torch.empty((num_experts, 1, 1), - device=device, - dtype=torch.float32) - w2_fp8scale = torch.empty((num_experts, 1, 1), - device=device, - dtype=torch.float32) + w1_fp8q = torch.empty( + (num_experts, 2 * n, k), device=device, dtype=torch.float8_e4m3fn + ) + w2_fp8q = torch.empty((num_experts, k, n), device=device, dtype=torch.float8_e4m3fn) + w1_fp8scale = torch.empty((num_experts, 1, 1), device=device, dtype=torch.float32) + w2_fp8scale = torch.empty((num_experts, 1, 1), device=device, dtype=torch.float32) for expert in range(num_experts): w1_fp8q[expert], w1_fp8scale[expert] = ops.scaled_fp8_quant(w1[expert]) @@ -91,26 +93,24 @@ def bench_run(results: list[benchmark.Measurement], model: str, topk_weights, topk_ids = fused_topk(a, score, topk, renormalize=False) quant_blocksize = 16 - w1_blockscale = torch.empty((num_experts, 2 * n, k // quant_blocksize), - device=device, - dtype=torch.float8_e4m3fn) - w2_blockscale = torch.empty((num_experts, k, n // quant_blocksize), - device=device, - 
dtype=torch.float8_e4m3fn) + w1_blockscale = torch.empty( + (num_experts, 2 * n, k // quant_blocksize), + device=device, + dtype=torch.float8_e4m3fn, + ) + w2_blockscale = torch.empty( + (num_experts, k, n // quant_blocksize), device=device, dtype=torch.float8_e4m3fn + ) # n_b_scales = 2 * n if per_out_ch else 1 # k_b_scales = k if per_out_ch else 1 - w1_fp4 = torch.empty((num_experts, 2 * n, k // 2), - device=device, - dtype=torch.uint8) - w2_fp4 = torch.empty((num_experts, k, n // 2), - device=device, - dtype=torch.uint8) - - w1_gs = torch.empty((num_experts, ), device=device, dtype=torch.float32) - w2_gs = torch.empty((num_experts, ), device=device, dtype=torch.float32) - a1_gs = torch.ones((num_experts, ), device=device, dtype=torch.float32) - a2_gs = torch.ones((num_experts, ), device=device, dtype=torch.float32) + w1_fp4 = torch.empty((num_experts, 2 * n, k // 2), device=device, dtype=torch.uint8) + w2_fp4 = torch.empty((num_experts, k, n // 2), device=device, dtype=torch.uint8) + + w1_gs = torch.empty((num_experts,), device=device, dtype=torch.float32) + w2_gs = torch.empty((num_experts,), device=device, dtype=torch.float32) + a1_gs = torch.ones((num_experts,), device=device, dtype=torch.float32) + a2_gs = torch.ones((num_experts,), device=device, dtype=torch.float32) for expert in range(num_experts): w1_e = w1[expert] @@ -121,96 +121,141 @@ def bench_run(results: list[benchmark.Measurement], model: str, w2_gs[expert] = FLOAT8_E4M3_MAX * FLOAT4_E2M1_MAX / w2_amax w1_fp4[expert], w1_blockscale[expert] = ops.scaled_fp4_quant( - w1_e, w1_gs[expert]) + w1_e, w1_gs[expert] + ) w2_fp4[expert], w2_blockscale[expert] = ops.scaled_fp4_quant( - w2_e, w2_gs[expert]) - - def run_triton_moe(a: torch.Tensor, w1: torch.Tensor, w2: torch.Tensor, - topk_weights: torch.Tensor, topk_ids: torch.Tensor, - w1_scale: torch.Tensor, w2_scale: torch.Tensor, - a_fp8_scale: torch.Tensor, num_repeats: int): + w2_e, w2_gs[expert] + ) + + def run_triton_moe( + a: torch.Tensor, + w1: torch.Tensor, + w2: torch.Tensor, + topk_weights: torch.Tensor, + topk_ids: torch.Tensor, + w1_scale: torch.Tensor, + w2_scale: torch.Tensor, + a_fp8_scale: torch.Tensor, + num_repeats: int, + ): for _ in range(num_repeats): - fused_experts(a, - w1, - w2, - topk_weights, - topk_ids, - use_fp8_w8a8=True, - w1_scale=w1_scale, - w2_scale=w2_scale, - a1_scale=a_fp8_scale) - - def run_cutlass_moe_fp4(a: torch.Tensor, w1_fp4: torch.Tensor, - w2_fp4: torch.Tensor, w1_blockscale: torch.Tensor, - w2_blockscale: torch.Tensor, w1_gs: torch.Tensor, - w2_gs: torch.Tensor, a1_gs: torch.Tensor, - a2_gs: torch.Tensor, topk_weights: torch.Tensor, - topk_ids: torch.Tensor, m: int, n: int, k: int, - e: int, device: torch.device, num_repeats: int): + fused_experts( + a, + w1, + w2, + topk_weights, + topk_ids, + use_fp8_w8a8=True, + w1_scale=w1_scale, + w2_scale=w2_scale, + a1_scale=a_fp8_scale, + ) + + def run_cutlass_moe_fp4( + a: torch.Tensor, + w1_fp4: torch.Tensor, + w2_fp4: torch.Tensor, + w1_blockscale: torch.Tensor, + w2_blockscale: torch.Tensor, + w1_gs: torch.Tensor, + w2_gs: torch.Tensor, + a1_gs: torch.Tensor, + a2_gs: torch.Tensor, + topk_weights: torch.Tensor, + topk_ids: torch.Tensor, + m: int, + n: int, + k: int, + e: int, + device: torch.device, + num_repeats: int, + ): for _ in range(num_repeats): with nvtx.annotate("cutlass_moe_fp4", color="green"): - cutlass_moe_fp4(a=a, - a1_gscale=a1_gs, - a2_gscale=a2_gs, - w1_fp4=w1_fp4, - w1_blockscale=w1_blockscale, - w1_alphas=w1_gs, - w2_fp4=w2_fp4, - w2_blockscale=w2_blockscale, - 
w2_alphas=w2_gs, - topk_weights=topk_weights, - topk_ids=topk_ids, - m=m, - n=n, - k=k, - e=num_experts, - device=device) + cutlass_moe_fp4( + a=a, + a1_gscale=a1_gs, + a2_gscale=a2_gs, + w1_fp4=w1_fp4, + w1_blockscale=w1_blockscale, + w1_alphas=w1_gs, + w2_fp4=w2_fp4, + w2_blockscale=w2_blockscale, + w2_alphas=w2_gs, + topk_weights=topk_weights, + topk_ids=topk_ids, + m=m, + n=n, + k=k, + e=num_experts, + device=device, + ) def run_cutlass_from_graph( - a: torch.Tensor, a1_gscale: torch.Tensor, w1_fp4: torch.Tensor, - w1_blockscale: torch.Tensor, w1_alphas: torch.Tensor, - a2_gscale: torch.Tensor, w2_fp4: torch.Tensor, - w2_blockscale: torch.Tensor, w2_alphas: torch.Tensor, - topk_weights: torch.Tensor, topk_ids: torch.Tensor, m: int, n: int, - k: int, e: int, device: torch.device): + a: torch.Tensor, + a1_gscale: torch.Tensor, + w1_fp4: torch.Tensor, + w1_blockscale: torch.Tensor, + w1_alphas: torch.Tensor, + a2_gscale: torch.Tensor, + w2_fp4: torch.Tensor, + w2_blockscale: torch.Tensor, + w2_alphas: torch.Tensor, + topk_weights: torch.Tensor, + topk_ids: torch.Tensor, + m: int, + n: int, + k: int, + e: int, + device: torch.device, + ): with set_current_vllm_config( - VllmConfig(parallel_config=ParallelConfig( - pipeline_parallel_size=1))): - return cutlass_moe_fp4(a=a, - a1_gscale=a1_gs, - w1_fp4=w1_fp4, - w1_blockscale=w1_blockscale, - w1_alphas=w1_alphas, - a2_gscale=a2_gs, - w2_fp4=w2_fp4, - w2_blockscale=w2_blockscale, - w2_alphas=w2_alphas, - topk_weights=topk_weights, - topk_ids=topk_ids, - m=m, - n=n, - k=k, - e=num_experts, - device=device) - - def run_triton_from_graph(a: torch.Tensor, w1: torch.Tensor, - w2: torch.Tensor, topk_weights: torch.Tensor, - topk_ids: torch.Tensor, w1_scale: torch.Tensor, - w2_scale: torch.Tensor, - a_fp8_scale: torch.Tensor): + VllmConfig(parallel_config=ParallelConfig(pipeline_parallel_size=1)) + ): + return cutlass_moe_fp4( + a=a, + a1_gscale=a1_gs, + w1_fp4=w1_fp4, + w1_blockscale=w1_blockscale, + w1_alphas=w1_alphas, + a2_gscale=a2_gs, + w2_fp4=w2_fp4, + w2_blockscale=w2_blockscale, + w2_alphas=w2_alphas, + topk_weights=topk_weights, + topk_ids=topk_ids, + m=m, + n=n, + k=k, + e=num_experts, + device=device, + ) + + def run_triton_from_graph( + a: torch.Tensor, + w1: torch.Tensor, + w2: torch.Tensor, + topk_weights: torch.Tensor, + topk_ids: torch.Tensor, + w1_scale: torch.Tensor, + w2_scale: torch.Tensor, + a_fp8_scale: torch.Tensor, + ): with set_current_vllm_config( - VllmConfig(parallel_config=ParallelConfig( - pipeline_parallel_size=1))): - return fused_experts(a, - w1, - w2, - topk_weights, - topk_ids, - use_fp8_w8a8=True, - w1_scale=w1_scale, - w2_scale=w2_scale, - a1_scale=a_fp8_scale) + VllmConfig(parallel_config=ParallelConfig(pipeline_parallel_size=1)) + ): + return fused_experts( + a, + w1, + w2, + topk_weights, + topk_ids, + use_fp8_w8a8=True, + w1_scale=w1_scale, + w2_scale=w2_scale, + a1_scale=a_fp8_scale, + ) def replay_graph(graph, num_repeats): for _ in range(num_repeats): @@ -220,30 +265,39 @@ def replay_graph(graph, num_repeats): cutlass_stream = torch.cuda.Stream() cutlass_graph = torch.cuda.CUDAGraph() with torch.cuda.graph(cutlass_graph, stream=cutlass_stream): - run_cutlass_from_graph(a=a, - a1_gscale=a1_gs, - w1_fp4=w1_fp4, - w1_blockscale=w1_blockscale, - w1_alphas=w1_gs, - a2_gscale=a2_gs, - w2_fp4=w2_fp4, - w2_blockscale=w2_blockscale, - w2_alphas=w2_gs, - topk_weights=topk_weights, - topk_ids=topk_ids, - m=m, - n=n, - k=k, - e=num_experts, - device=device) + run_cutlass_from_graph( + a=a, + a1_gscale=a1_gs, + 
w1_fp4=w1_fp4, + w1_blockscale=w1_blockscale, + w1_alphas=w1_gs, + a2_gscale=a2_gs, + w2_fp4=w2_fp4, + w2_blockscale=w2_blockscale, + w2_alphas=w2_gs, + topk_weights=topk_weights, + topk_ids=topk_ids, + m=m, + n=n, + k=k, + e=num_experts, + device=device, + ) torch.cuda.synchronize() triton_stream = torch.cuda.Stream() triton_graph = torch.cuda.CUDAGraph() with torch.cuda.graph(triton_graph, stream=triton_stream): - run_triton_from_graph(a, w1_fp8q_notransp, w2_fp8q_notransp, - topk_weights, topk_ids, w1_fp8scale, w2_fp8scale, - a_fp8_scale) + run_triton_from_graph( + a, + w1_fp8q_notransp, + w2_fp8q_notransp, + topk_weights, + topk_ids, + w1_fp8scale, + w2_fp8scale, + a_fp8_scale, + ) torch.cuda.synchronize() min_run_time = 5 @@ -290,18 +344,27 @@ def replay_graph(graph, num_repeats): } # Warmup - run_triton_moe(a, w1_fp8q_notransp, w2_fp8q_notransp, topk_weights, - topk_ids, w1_fp8scale, w2_fp8scale, a_fp8_scale, num_warmup) + run_triton_moe( + a, + w1_fp8q_notransp, + w2_fp8q_notransp, + topk_weights, + topk_ids, + w1_fp8scale, + w2_fp8scale, + a_fp8_scale, + num_warmup, + ) results.append( benchmark.Timer( - stmt= - "run_triton_moe(a, w1_fp8q_notransp, w2_fp8q_notransp, topk_weights, topk_ids, w1_fp8scale, w2_fp8scale, a_fp8_scale, num_runs)", # noqa: E501 + stmt="run_triton_moe(a, w1_fp8q_notransp, w2_fp8q_notransp, topk_weights, topk_ids, w1_fp8scale, w2_fp8scale, a_fp8_scale, num_runs)", # noqa: E501 globals=globals, label=label, sub_label=sub_label, description="triton_moe", - ).blocked_autorange(min_run_time=min_run_time)) + ).blocked_autorange(min_run_time=min_run_time) + ) # Warmup replay_graph(triton_graph, num_warmup) @@ -313,23 +376,40 @@ def replay_graph(graph, num_repeats): label=label, sub_label=sub_label, description="triton_moe_cuda_graphs", - ).blocked_autorange(min_run_time=min_run_time)) + ).blocked_autorange(min_run_time=min_run_time) + ) # Warmup - run_cutlass_moe_fp4(a, w1_fp4, w2_fp4, w1_blockscale, w2_blockscale, w1_gs, - w2_gs, a1_gs, a2_gs, topk_weights, topk_ids, m, n, k, - num_experts, device, num_warmup) + run_cutlass_moe_fp4( + a, + w1_fp4, + w2_fp4, + w1_blockscale, + w2_blockscale, + w1_gs, + w2_gs, + a1_gs, + a2_gs, + topk_weights, + topk_ids, + m, + n, + k, + num_experts, + device, + num_warmup, + ) results.append( benchmark.Timer( - stmt= - "run_cutlass_moe_fp4(a, w1_fp4, w2_fp4, w1_blockscale, w2_blockscale, w1_alphas, w2_alphas, a1_gscale, a2_gscale, topk_weights, topk_ids, m, n, k, e, device, num_runs)", # noqa: E501 + stmt="run_cutlass_moe_fp4(a, w1_fp4, w2_fp4, w1_blockscale, w2_blockscale, w1_alphas, w2_alphas, a1_gscale, a2_gscale, topk_weights, topk_ids, m, n, k, e, device, num_runs)", # noqa: E501 globals=globals, label=label, sub_label=sub_label, description="cutlass_moe_fp4", - ).blocked_autorange(min_run_time=min_run_time)) + ).blocked_autorange(min_run_time=min_run_time) + ) # Warmup replay_graph(cutlass_graph, num_warmup) @@ -341,7 +421,8 @@ def replay_graph(graph, num_repeats): label=label, sub_label=sub_label, description="cutlass_moe_fp4_cuda_graphs", - ).blocked_autorange(min_run_time=min_run_time)) + ).blocked_autorange(min_run_time=min_run_time) + ) def main(args): @@ -369,8 +450,15 @@ def main(args): for per_out_ch in PER_OUT_CH_OPTS: for size_m in args.batch_sizes: mkn = (size_m, size_k, size_n) - bench_run(results, model, num_experts, topk, - per_act_token, per_out_ch, mkn) + bench_run( + results, + model, + num_experts, + topk, + per_act_token, + per_out_ch, + mkn, + ) compare = benchmark.Compare(results) compare.print() @@ 
-378,8 +466,8 @@ def main(args): if __name__ == "__main__": parser = FlexibleArgumentParser( - description="Benchmark NVFP4 CUTLASS MOE across specified " - "models/shapes/batches") + description="Benchmark NVFP4 CUTLASS MOE across specified models/shapes/batches" + ) parser.add_argument( "--models", nargs="+", @@ -387,21 +475,14 @@ def main(args): default=DEFAULT_MODELS, choices=WEIGHT_SHAPES_MOE.keys(), ) - parser.add_argument("--tp-sizes", - nargs="+", - type=int, - default=DEFAULT_TP_SIZES) - parser.add_argument("--batch-sizes", - nargs="+", - type=int, - default=DEFAULT_BATCH_SIZES) + parser.add_argument("--tp-sizes", nargs="+", type=int, default=DEFAULT_TP_SIZES) + parser.add_argument( + "--batch-sizes", nargs="+", type=int, default=DEFAULT_BATCH_SIZES + ) parser.add_argument("--limit-k", nargs="+", type=int, default=[]) parser.add_argument("--limit-n", nargs="+", type=int, default=[]) parser.add_argument("--limit-num-groups", nargs="+", type=int, default=[]) - parser.add_argument("--limit-per-act-token", - nargs="+", - type=int, - default=[]) + parser.add_argument("--limit-per-act-token", nargs="+", type=int, default=[]) parser.add_argument("--limit-per-out-ch", nargs="+", type=int, default=[]) args = parser.parse_args() diff --git a/benchmarks/kernels/benchmark_grouped_gemm_cutlass.py b/benchmarks/kernels/benchmark_grouped_gemm_cutlass.py index c92ea43e8260..2197bceabe6c 100644 --- a/benchmarks/kernels/benchmark_grouped_gemm_cutlass.py +++ b/benchmarks/kernels/benchmark_grouped_gemm_cutlass.py @@ -6,14 +6,18 @@ from vllm import _custom_ops as ops from vllm.config import ParallelConfig, VllmConfig, set_current_vllm_config -from vllm.model_executor.layers.fused_moe.fused_moe import (cutlass_moe_fp8, - fused_experts, - fused_topk) +from vllm.model_executor.layers.fused_moe.fused_moe import ( + cutlass_moe_fp8, + fused_experts, + fused_topk, +) from vllm.utils import FlexibleArgumentParser DEFAULT_MODELS = [ - "nm-testing/Mixtral-8x7B-Instruct-v0.1", "nm-testing/deepseekv2-lite", - "ibm-granite/granite-3.0-1b-a400m", "ibm-granite/granite-3.0-3b-a800m" + "nm-testing/Mixtral-8x7B-Instruct-v0.1", + "nm-testing/deepseekv2-lite", + "ibm-granite/granite-3.0-1b-a400m", + "ibm-granite/granite-3.0-3b-a800m", ] DEFAULT_BATCH_SIZES = [1, 4, 8, 16, 32, 64, 128, 256, 512] DEFAULT_TP_SIZES = [1] @@ -24,19 +28,27 @@ def to_fp8(tensor: torch.Tensor): finfo = torch.finfo(torch.float8_e4m3fn) - return torch.round(tensor.clamp( - min=finfo.min, max=finfo.max)).to(dtype=torch.float8_e4m3fn) + return torch.round(tensor.clamp(min=finfo.min, max=finfo.max)).to( + dtype=torch.float8_e4m3fn + ) -def bench_run(results: list[benchmark.Measurement], model: str, - num_experts: int, topk: int, per_act_token: bool, - per_out_ch: bool, mkn: tuple[int, int, int]): +def bench_run( + results: list[benchmark.Measurement], + model: str, + num_experts: int, + topk: int, + per_act_token: bool, + per_out_ch: bool, + mkn: tuple[int, int, int], +): label = "Quant Matmul" sub_label = ( - "{}, num_experts={}, topk={}, per_act_token={} per_out_ch={}, " - "MKN=({})".format(model, num_experts, topk, per_act_token, per_out_ch, - mkn)) + "{}, num_experts={}, topk={}, per_act_token={} per_out_ch={}, MKN=({})".format( + model, num_experts, topk, per_act_token, per_out_ch, mkn + ) + ) print(f"Testing: {sub_label}") @@ -50,35 +62,17 @@ def bench_run(results: list[benchmark.Measurement], model: str, _, a_scale = ops.scaled_fp8_quant(a) - w1_q = torch.empty((num_experts, 2 * n, k), - device="cuda", - dtype=torch.float8_e4m3fn) - w2_q = 
torch.empty((num_experts, k, n), - device="cuda", - dtype=torch.float8_e4m3fn) - w1_scale = torch.empty((num_experts, 1, 1), - device="cuda", - dtype=torch.float32) - w2_scale = torch.empty((num_experts, 1, 1), - device="cuda", - dtype=torch.float32) - - ab_strides1 = torch.full((num_experts, ), - k, - device="cuda", - dtype=torch.int64) - c_strides1 = torch.full((num_experts, ), - 2 * n, - device="cuda", - dtype=torch.int64) - ab_strides2 = torch.full((num_experts, ), - n, - device="cuda", - dtype=torch.int64) - c_strides2 = torch.full((num_experts, ), - k, - device="cuda", - dtype=torch.int64) + w1_q = torch.empty( + (num_experts, 2 * n, k), device="cuda", dtype=torch.float8_e4m3fn + ) + w2_q = torch.empty((num_experts, k, n), device="cuda", dtype=torch.float8_e4m3fn) + w1_scale = torch.empty((num_experts, 1, 1), device="cuda", dtype=torch.float32) + w2_scale = torch.empty((num_experts, 1, 1), device="cuda", dtype=torch.float32) + + ab_strides1 = torch.full((num_experts,), k, device="cuda", dtype=torch.int64) + c_strides1 = torch.full((num_experts,), 2 * n, device="cuda", dtype=torch.int64) + ab_strides2 = torch.full((num_experts,), n, device="cuda", dtype=torch.int64) + c_strides2 = torch.full((num_experts,), k, device="cuda", dtype=torch.int64) for expert in range(num_experts): w1_q[expert], w1_scale[expert] = ops.scaled_fp8_quant(w1[expert]) @@ -91,82 +85,120 @@ def bench_run(results: list[benchmark.Measurement], model: str, score = torch.randn((m, num_experts), device="cuda", dtype=dtype) topk_weights, topk_ids, token_expert_indices = fused_topk( - a, score, topk, renormalize=False) + a, score, topk, renormalize=False + ) - def run_triton_moe(a: torch.Tensor, w1: torch.Tensor, w2: torch.Tensor, - topk_weights: torch.Tensor, topk_ids: torch.Tensor, - w1_scale: torch.Tensor, w2_scale: torch.Tensor, - a_scale: torch.Tensor, num_repeats: int): + def run_triton_moe( + a: torch.Tensor, + w1: torch.Tensor, + w2: torch.Tensor, + topk_weights: torch.Tensor, + topk_ids: torch.Tensor, + w1_scale: torch.Tensor, + w2_scale: torch.Tensor, + a_scale: torch.Tensor, + num_repeats: int, + ): for _ in range(num_repeats): - fused_experts(a, - w1, - w2, - topk_weights, - topk_ids, - use_fp8_w8a8=True, - w1_scale=w1_scale, - w2_scale=w2_scale, - a1_scale=a_scale) - - def run_cutlass_moe(a: torch.Tensor, a_scale: torch.Tensor, - w1: torch.Tensor, w2: torch.Tensor, - w1_scale: torch.Tensor, w2_scale: torch.Tensor, - topk_weights: torch.Tensor, topk_ids: torch.Tensor, - ab_strides1: torch.Tensor, c_strides1: torch.Tensor, - ab_strides2: torch.Tensor, c_strides2: torch.Tensor, - num_repeats: int): + fused_experts( + a, + w1, + w2, + topk_weights, + topk_ids, + use_fp8_w8a8=True, + w1_scale=w1_scale, + w2_scale=w2_scale, + a1_scale=a_scale, + ) + + def run_cutlass_moe( + a: torch.Tensor, + a_scale: torch.Tensor, + w1: torch.Tensor, + w2: torch.Tensor, + w1_scale: torch.Tensor, + w2_scale: torch.Tensor, + topk_weights: torch.Tensor, + topk_ids: torch.Tensor, + ab_strides1: torch.Tensor, + c_strides1: torch.Tensor, + ab_strides2: torch.Tensor, + c_strides2: torch.Tensor, + num_repeats: int, + ): for _ in range(num_repeats): - cutlass_moe_fp8(a, - w1, - w2, - w1_scale, - w2_scale, - topk_weights, - topk_ids, - ab_strides1, - c_strides1, - ab_strides2, - c_strides2, - a1_scale=a_scale) + cutlass_moe_fp8( + a, + w1, + w2, + w1_scale, + w2_scale, + topk_weights, + topk_ids, + ab_strides1, + c_strides1, + ab_strides2, + c_strides2, + a1_scale=a_scale, + ) def run_cutlass_from_graph( - a: torch.Tensor, a_scale: 
torch.Tensor, w1_q: torch.Tensor, - w2_q: torch.Tensor, w1_scale: torch.Tensor, w2_scale: torch.Tensor, - topk_weights: torch.Tensor, topk_ids: torch.Tensor, - ab_strides1: torch.Tensor, c_strides1: torch.Tensor, - ab_strides2: torch.Tensor, c_strides2: torch.Tensor): + a: torch.Tensor, + a_scale: torch.Tensor, + w1_q: torch.Tensor, + w2_q: torch.Tensor, + w1_scale: torch.Tensor, + w2_scale: torch.Tensor, + topk_weights: torch.Tensor, + topk_ids: torch.Tensor, + ab_strides1: torch.Tensor, + c_strides1: torch.Tensor, + ab_strides2: torch.Tensor, + c_strides2: torch.Tensor, + ): with set_current_vllm_config( - VllmConfig(parallel_config=ParallelConfig( - pipeline_parallel_size=1))): - return cutlass_moe_fp8(a, - w1_q, - w2_q, - w1_scale, - w2_scale, - topk_weights, - topk_ids, - ab_strides1, - c_strides1, - ab_strides2, - c_strides2, - a1_scale=a_scale) - - def run_triton_from_graph(a: torch.Tensor, w1: torch.Tensor, - w2: torch.Tensor, topk_weights: torch.Tensor, - topk_ids: torch.Tensor, w1_scale: torch.Tensor, - w2_scale: torch.Tensor, a_scale: torch.Tensor): + VllmConfig(parallel_config=ParallelConfig(pipeline_parallel_size=1)) + ): + return cutlass_moe_fp8( + a, + w1_q, + w2_q, + w1_scale, + w2_scale, + topk_weights, + topk_ids, + ab_strides1, + c_strides1, + ab_strides2, + c_strides2, + a1_scale=a_scale, + ) + + def run_triton_from_graph( + a: torch.Tensor, + w1: torch.Tensor, + w2: torch.Tensor, + topk_weights: torch.Tensor, + topk_ids: torch.Tensor, + w1_scale: torch.Tensor, + w2_scale: torch.Tensor, + a_scale: torch.Tensor, + ): with set_current_vllm_config( - VllmConfig(parallel_config=ParallelConfig( - pipeline_parallel_size=1))): - return fused_experts(a, - w1, - w2, - topk_weights, - topk_ids, - use_fp8_w8a8=True, - w1_scale=w1_scale, - w2_scale=w2_scale, - a1_scale=a_scale) + VllmConfig(parallel_config=ParallelConfig(pipeline_parallel_size=1)) + ): + return fused_experts( + a, + w1, + w2, + topk_weights, + topk_ids, + use_fp8_w8a8=True, + w1_scale=w1_scale, + w2_scale=w2_scale, + a1_scale=a_scale, + ) def replay_graph(graph, num_repeats): for _ in range(num_repeats): @@ -176,16 +208,35 @@ def replay_graph(graph, num_repeats): cutlass_stream = torch.cuda.Stream() cutlass_graph = torch.cuda.CUDAGraph() with torch.cuda.graph(cutlass_graph, stream=cutlass_stream): - run_cutlass_from_graph(a, a_scale, w1_q, w2_q, w1_scale, w2_scale, - topk_weights, topk_ids, ab_strides1, c_strides1, - ab_strides2, c_strides2) + run_cutlass_from_graph( + a, + a_scale, + w1_q, + w2_q, + w1_scale, + w2_scale, + topk_weights, + topk_ids, + ab_strides1, + c_strides1, + ab_strides2, + c_strides2, + ) torch.cuda.synchronize() triton_stream = torch.cuda.Stream() triton_graph = torch.cuda.CUDAGraph() with torch.cuda.graph(triton_graph, stream=triton_stream): - run_triton_from_graph(a, w1_q_notransp, w2_q_notransp, topk_weights, - topk_ids, w1_scale, w2_scale, a_scale) + run_triton_from_graph( + a, + w1_q_notransp, + w2_q_notransp, + topk_weights, + topk_ids, + w1_scale, + w2_scale, + a_scale, + ) torch.cuda.synchronize() min_run_time = 5 @@ -225,18 +276,27 @@ def replay_graph(graph, num_repeats): } # Warmup - run_triton_moe(a, w1_q_notransp, w2_q_notransp, topk_weights, topk_ids, - w1_scale, w2_scale, a_scale, num_warmup) + run_triton_moe( + a, + w1_q_notransp, + w2_q_notransp, + topk_weights, + topk_ids, + w1_scale, + w2_scale, + a_scale, + num_warmup, + ) results.append( benchmark.Timer( - stmt= - "run_triton_moe(a, w1_q_notransp, w2_q_notransp, topk_weights, topk_ids, w1_scale, w2_scale, a_scale, 
num_runs)", # noqa: E501 + stmt="run_triton_moe(a, w1_q_notransp, w2_q_notransp, topk_weights, topk_ids, w1_scale, w2_scale, a_scale, num_runs)", # noqa: E501 globals=globals, label=label, sub_label=sub_label, description="triton_moe", - ).blocked_autorange(min_run_time=min_run_time)) + ).blocked_autorange(min_run_time=min_run_time) + ) # Warmup replay_graph(triton_graph, num_warmup) @@ -248,22 +308,35 @@ def replay_graph(graph, num_repeats): label=label, sub_label=sub_label, description="triton_moe_cuda_graphs", - ).blocked_autorange(min_run_time=min_run_time)) + ).blocked_autorange(min_run_time=min_run_time) + ) # Warmup - run_cutlass_moe(a, a_scale, w1_q, w2_q, w1_scale, w2_scale, topk_weights, - topk_ids, ab_strides1, c_strides1, ab_strides2, c_strides2, - num_warmup) + run_cutlass_moe( + a, + a_scale, + w1_q, + w2_q, + w1_scale, + w2_scale, + topk_weights, + topk_ids, + ab_strides1, + c_strides1, + ab_strides2, + c_strides2, + num_warmup, + ) results.append( benchmark.Timer( - stmt= - "run_cutlass_moe(a, a_scale, w1_q, w2_q, w1_scale, w2_scale, topk_weights, topk_ids, ab_strides1, c_strides1, ab_strides2, c_strides2, num_runs)", # noqa: E501 + stmt="run_cutlass_moe(a, a_scale, w1_q, w2_q, w1_scale, w2_scale, topk_weights, topk_ids, ab_strides1, c_strides1, ab_strides2, c_strides2, num_runs)", # noqa: E501 globals=globals, label=label, sub_label=sub_label, description="grouped_gemm_moe", - ).blocked_autorange(min_run_time=min_run_time)) + ).blocked_autorange(min_run_time=min_run_time) + ) # Warmup replay_graph(cutlass_graph, num_warmup) @@ -275,7 +348,8 @@ def replay_graph(graph, num_repeats): label=label, sub_label=sub_label, description="grouped_gemm_moe_cuda_graphs", - ).blocked_autorange(min_run_time=min_run_time)) + ).blocked_autorange(min_run_time=min_run_time) + ) def main(args): @@ -303,8 +377,15 @@ def main(args): for per_out_ch in PER_OUT_CH_OPTS: for size_m in DEFAULT_BATCH_SIZES: mkn = (size_m, size_k, size_n) - bench_run(results, model, num_experts, topk, - per_act_token, per_out_ch, mkn) + bench_run( + results, + model, + num_experts, + topk, + per_act_token, + per_out_ch, + mkn, + ) compare = benchmark.Compare(results) compare.print() @@ -312,7 +393,8 @@ def main(args): if __name__ == "__main__": parser = FlexibleArgumentParser( - description="Benchmark Marlin across specified models/shapes/batches") + description="Benchmark Marlin across specified models/shapes/batches" + ) parser.add_argument( "--models", nargs="+", @@ -320,21 +402,14 @@ def main(args): default=DEFAULT_MODELS, choices=WEIGHT_SHAPES_MOE.keys(), ) - parser.add_argument("--tp-sizes", - nargs="+", - type=int, - default=DEFAULT_TP_SIZES) - parser.add_argument("--batch-sizes", - nargs="+", - type=int, - default=DEFAULT_BATCH_SIZES) + parser.add_argument("--tp-sizes", nargs="+", type=int, default=DEFAULT_TP_SIZES) + parser.add_argument( + "--batch-sizes", nargs="+", type=int, default=DEFAULT_BATCH_SIZES + ) parser.add_argument("--limit-k", nargs="+", type=int, default=[]) parser.add_argument("--limit-n", nargs="+", type=int, default=[]) parser.add_argument("--limit-num-groups", nargs="+", type=int, default=[]) - parser.add_argument("--limit-per-act-token", - nargs="+", - type=int, - default=[]) + parser.add_argument("--limit-per-act-token", nargs="+", type=int, default=[]) parser.add_argument("--limit-per-out-ch", nargs="+", type=int, default=[]) args = parser.parse_args() diff --git a/benchmarks/kernels/benchmark_layernorm.py b/benchmarks/kernels/benchmark_layernorm.py index e12d74c01e43..f21ca97eeb8a 100644 
--- a/benchmarks/kernels/benchmark_layernorm.py +++ b/benchmarks/kernels/benchmark_layernorm.py @@ -10,14 +10,16 @@ @torch.inference_mode() -def main(num_tokens: int, - hidden_size: int, - add_residual: bool, - dtype: torch.dtype, - seed: int = 0, - do_profile: bool = False, - num_warmup_iters: int = 5, - num_iters: int = 100) -> None: +def main( + num_tokens: int, + hidden_size: int, + add_residual: bool, + dtype: torch.dtype, + seed: int = 0, + do_profile: bool = False, + num_warmup_iters: int = 5, + num_iters: int = 100, +) -> None: current_platform.seed_everything(seed) torch.set_default_device("cuda") @@ -56,33 +58,35 @@ def run_cuda_benchmark(num_iters: int, profile: bool = False) -> float: print(f"Kernel running time: {latency * 1000000:.3f} us") -if __name__ == '__main__': - parser = FlexibleArgumentParser( - description="Benchmark the layernorm kernel.") +if __name__ == "__main__": + parser = FlexibleArgumentParser(description="Benchmark the layernorm kernel.") parser.add_argument("--num-tokens", type=int, default=4096) parser.add_argument("--hidden-size", type=int, default=8192) parser.add_argument("--add-residual", action="store_true") - parser.add_argument("--dtype", - type=str, - choices=["half", "bfloat16", "float"], - default="half") + parser.add_argument( + "--dtype", type=str, choices=["half", "bfloat16", "float"], default="half" + ) parser.add_argument("--seed", type=int, default=0) parser.add_argument("--profile", action="store_true") parser.add_argument("--num-warmup-iters", type=int, default=5) - parser.add_argument("--num-iters", - type=int, - default=100, - help="Number of benchmark iterations. " - "If --profile is set, this number is ignored") + parser.add_argument( + "--num-iters", + type=int, + default=100, + help="Number of benchmark iterations. 
" + "If --profile is set, this number is ignored", + ) args = parser.parse_args() print(args) - main(num_tokens=args.num_tokens, - hidden_size=args.hidden_size, - add_residual=args.add_residual, - dtype=STR_DTYPE_TO_TORCH_DTYPE[args.dtype], - seed=args.seed, - do_profile=args.profile, - num_warmup_iters=args.num_warmup_iters, - num_iters=args.num_iters) + main( + num_tokens=args.num_tokens, + hidden_size=args.hidden_size, + add_residual=args.add_residual, + dtype=STR_DTYPE_TO_TORCH_DTYPE[args.dtype], + seed=args.seed, + do_profile=args.profile, + num_warmup_iters=args.num_warmup_iters, + num_iters=args.num_iters, + ) diff --git a/benchmarks/kernels/benchmark_lora.py b/benchmarks/kernels/benchmark_lora.py index d382ede10b41..6c1284930c1e 100644 --- a/benchmarks/kernels/benchmark_lora.py +++ b/benchmarks/kernels/benchmark_lora.py @@ -20,18 +20,36 @@ from vllm.triton_utils import HAS_TRITON if HAS_TRITON: - from vllm.lora.ops.triton_ops import (LoRAKernelMeta, lora_expand, - lora_shrink) - from vllm.lora.ops.triton_ops.utils import (_LORA_A_PTR_DICT, - _LORA_B_PTR_DICT) + from vllm.lora.ops.triton_ops import LoRAKernelMeta, lora_expand, lora_shrink + from vllm.lora.ops.triton_ops.utils import _LORA_A_PTR_DICT, _LORA_B_PTR_DICT from vllm.utils import FlexibleArgumentParser DEFAULT_MODELS = list(WEIGHT_SHAPES.keys()) DEFAULT_TP_SIZES = [1] DEFAULT_BATCH_SIZES = [ - 1, 16, 32, 64, 128, 192, 256, 320, 384, 448, 512, 640, 768, 896, 1024, - 2048, 3072, 4096, 5120, 6144, 7168, 8192 + 1, + 16, + 32, + 64, + 128, + 192, + 256, + 320, + 384, + 448, + 512, + 640, + 768, + 896, + 1024, + 2048, + 3072, + 4096, + 5120, + 6144, + 7168, + 8192, ] DEFAULT_HIDDEN_SIZES = [1024, 2048, 4096, 8192, 16384] DEFAULT_LORA_RANKS = [16] @@ -52,12 +70,9 @@ def dtype_to_str(dtype: torch.dtype): raise ValueError(f"Unsupported dtype {dtype}") -def make_rand_lora_weight_tensor(k: int, - n: int, - num_loras: int, - dtype: torch.dtype, - device: str = "cuda") -> torch.Tensor: - +def make_rand_lora_weight_tensor( + k: int, n: int, num_loras: int, dtype: torch.dtype, device: str = "cuda" +) -> torch.Tensor: # LoRA weights column major return torch.rand((num_loras, n, k), dtype=dtype).to(device) @@ -78,18 +93,15 @@ def make_rand_tensors( A = torch.rand(a_shape, dtype=a_dtype).to(device) # LoRA weights column major - Bs = [ - torch.rand(b_shape, dtype=b_dtype).to(device) - for _ in range(num_slices) - ] + Bs = [torch.rand(b_shape, dtype=b_dtype).to(device) for _ in range(num_slices)] C = torch.zeros(c_shape, dtype=c_dtype).to(device) return A, Bs, C -def make_prompt_lora_mapping(num_prompts: int, num_active_loras: int, - sort_by_lora_id: bool, - device: str) -> torch.Tensor: +def make_prompt_lora_mapping( + num_prompts: int, num_active_loras: int, sort_by_lora_id: bool, device: str +) -> torch.Tensor: """ All prompts are mapped to a LoRA ID in range [0, num_active_loras). where 0 refers to first lora, 1 refers to second lora and so on. @@ -97,9 +109,7 @@ def make_prompt_lora_mapping(num_prompts: int, num_active_loras: int, assert num_active_loras > 0 if not sort_by_lora_id: - return torch.randint(0, - num_active_loras, (num_prompts, ), - dtype=torch.long) + return torch.randint(0, num_active_loras, (num_prompts,), dtype=torch.long) # Divide LoRAs equally and in order. 
part_size = num_prompts // num_active_loras @@ -110,14 +120,18 @@ def make_prompt_lora_mapping(num_prompts: int, num_active_loras: int, while len(prompt_lora_mapping) < num_prompts: prompt_lora_mapping.extend([lora_id] * part_size) lora_id = lora_id + 1 if lora_id + 1 < num_active_loras else lora_id - return torch.tensor(prompt_lora_mapping[:num_prompts], - dtype=torch.long, - device=device) - - -def make_token_lora_mapping(num_tokens: int, num_prompts: int, - prompt_lora_mapping: torch.Tensor, - seq_len_tensor: torch.Tensor, device: str): + return torch.tensor( + prompt_lora_mapping[:num_prompts], dtype=torch.long, device=device + ) + + +def make_token_lora_mapping( + num_tokens: int, + num_prompts: int, + prompt_lora_mapping: torch.Tensor, + seq_len_tensor: torch.Tensor, + device: str, +): """ Make token_lora_mapping from prompt_lora_mapping and seq_lens_tensor """ @@ -136,11 +150,15 @@ def make_token_lora_mapping(num_tokens: int, num_prompts: int, return torch.tensor(token_lora_mapping, dtype=torch.long, device=device) -def ref_group_gemm(ref_out: torch.Tensor, input: torch.Tensor, - lora_weights: list[torch.Tensor], - seq_lens_cpu: torch.Tensor, - prompt_lora_mapping_cpu: torch.Tensor, scaling: float, - add_inputs: Optional[bool]): +def ref_group_gemm( + ref_out: torch.Tensor, + input: torch.Tensor, + lora_weights: list[torch.Tensor], + seq_lens_cpu: torch.Tensor, + prompt_lora_mapping_cpu: torch.Tensor, + scaling: float, + add_inputs: Optional[bool], +): """ Torch group gemm reference implementation to test correctness of benchmarking operations. @@ -149,7 +167,7 @@ def ref_group_gemm(ref_out: torch.Tensor, input: torch.Tensor, out_list = [] current_offset = 0 for lora_index, b_length in zip(range(batches), seq_lens_cpu): - x = input[current_offset:b_length + current_offset, :] + x = input[current_offset : b_length + current_offset, :] current_offset += b_length w = lora_weights[prompt_lora_mapping_cpu[lora_index]] result = torch.nn.functional.linear(x, w) @@ -168,6 +186,7 @@ class OpType(Enum): """ LoRA Ops to benchmark and its properties. 
""" + LORA_SHRINK = auto() LORA_EXPAND = auto() @@ -188,8 +207,9 @@ def is_expand_fn(self) -> bool: def num_slices(self) -> list[int]: return [1, 2, 3] - def mkn(self, batch_size: int, seq_length: int, hidden_size: int, - lora_rank: int) -> tuple[int, int, int]: + def mkn( + self, batch_size: int, seq_length: int, hidden_size: int, lora_rank: int + ) -> tuple[int, int, int]: num_tokens = batch_size * seq_length if self.is_shrink_fn(): m = num_tokens @@ -203,7 +223,7 @@ def mkn(self, batch_size: int, seq_length: int, hidden_size: int, return m, k, n def matmul_dtypes( - self, op_dtype: torch.dtype + self, op_dtype: torch.dtype ) -> tuple[torch.dtype, torch.dtype, torch.dtype]: """ return a type, b type and c type for A x B = C @@ -215,9 +235,14 @@ def matmul_dtypes( return torch.float32, op_dtype, op_dtype def matmul_shapes( - self, batch_size: int, seq_length: int, hidden_size: int, - lora_rank: int, num_loras: int, - num_slices: int) -> tuple[tuple[int], tuple[int], tuple[int]]: + self, + batch_size: int, + seq_length: int, + hidden_size: int, + lora_rank: int, + num_loras: int, + num_slices: int, + ) -> tuple[tuple[int], tuple[int], tuple[int]]: """ Given num_slices, return the shapes of the A, B, and C matrices in A x B = C, for the op_type @@ -241,31 +266,38 @@ def bench_fn(self) -> Callable: raise ValueError(f"Unrecognized optype {self}") - def run_ref_group_gemm(self, output: torch.Tensor, input: torch.Tensor, - lora_weights: list[torch.Tensor], - **kwargs) -> Callable: + def run_ref_group_gemm( + self, + output: torch.Tensor, + input: torch.Tensor, + lora_weights: list[torch.Tensor], + **kwargs, + ) -> Callable: """Each benchmark operation expects the input, lora_weights and outputs - in a slightly different format. Refer to self.matmul_shapes(). - run_ref_group_gemm accounts for those differences in executing a - reference group gemm for correctness testing. + in a slightly different format. Refer to self.matmul_shapes(). + run_ref_group_gemm accounts for those differences in executing a + reference group gemm for correctness testing. 
""" w_dtype = lora_weights[0].dtype num_slices = len(lora_weights) if self in [OpType.LORA_SHRINK]: for slice_idx in range(num_slices): - ref_group_gemm(ref_out=output[slice_idx, :], - input=input, - lora_weights=lora_weights[slice_idx], - **kwargs) + ref_group_gemm( + ref_out=output[slice_idx, :], + input=input, + lora_weights=lora_weights[slice_idx], + **kwargs, + ) elif self in [OpType.LORA_EXPAND]: hidden_size = lora_weights[0].shape[1] for slice_idx in range(num_slices): slice_offset = slice_idx * hidden_size ref_group_gemm( - ref_out=output[:, slice_offset:slice_offset + hidden_size], + ref_out=output[:, slice_offset : slice_offset + hidden_size], input=input[slice_idx].clone().to(dtype=w_dtype), lora_weights=lora_weights[slice_idx], - **kwargs) + **kwargs, + ) else: raise ValueError(f"Unrecognized optype {self}") @@ -275,6 +307,7 @@ class BenchmarkContext: """ LoRA benchmark context """ + batch_size: int hidden_size: int num_loras: int @@ -299,17 +332,18 @@ def bench_label(self) -> str: return f"lora-{self.dtype}" def bench_sublabel(self, op_type: OpType) -> str: - m, k, n = op_type.mkn(self.batch_size, self.seq_length, - self.hidden_size, self.lora_rank) + m, k, n = op_type.mkn( + self.batch_size, self.seq_length, self.hidden_size, self.lora_rank + ) desc = { - 'bs': self.batch_size, - 'sl': self.seq_length, - 'm': m, - 'k': k, - 'n': n, - 'num_loras': self.num_loras, - 'sort_by_lora': self.sort_by_lora_id, - 'num_slices': self.num_slices, + "bs": self.batch_size, + "sl": self.seq_length, + "m": m, + "k": k, + "n": n, + "num_loras": self.num_loras, + "sort_by_lora": self.sort_by_lora_id, + "num_slices": self.num_slices, } return json.dumps(desc) @@ -319,6 +353,7 @@ class BenchmarkTensors: """ Input/Output tensors used for benchmarks """ + # matmul tensors input: torch.Tensor lora_weights_lst: list[torch.Tensor] @@ -330,23 +365,29 @@ class BenchmarkTensors: prompt_lora_mapping: torch.Tensor def io_types(self) -> str: - return (f"{dtype_to_str(self.input.dtype)}x" - f"{dtype_to_str(self.lora_weights_lst[0].dtype)}=>" - f"{dtype_to_str(self.output.dtype)}") + return ( + f"{dtype_to_str(self.input.dtype)}x" + f"{dtype_to_str(self.lora_weights_lst[0].dtype)}=>" + f"{dtype_to_str(self.output.dtype)}" + ) @staticmethod - def make(ctx: BenchmarkContext, - op_type: OpType, - device: str = "cuda") -> "BenchmarkTensors": - + def make( + ctx: BenchmarkContext, op_type: OpType, device: str = "cuda" + ) -> "BenchmarkTensors": # Make input / output matmul tensors. a_shape, b_shape, c_shape = op_type.matmul_shapes( - ctx.batch_size, ctx.seq_length, ctx.hidden_size, ctx.lora_rank, - ctx.num_loras, ctx.num_slices) + ctx.batch_size, + ctx.seq_length, + ctx.hidden_size, + ctx.lora_rank, + ctx.num_loras, + ctx.num_slices, + ) a_type, b_type, c_type = op_type.matmul_dtypes(ctx.dtype) - input_tensor, lora_weights, output_tensor = \ - make_rand_tensors(a_shape, b_shape, c_shape, a_type, b_type, c_type, - num_slices = ctx.num_slices) + input_tensor, lora_weights, output_tensor = make_rand_tensors( + a_shape, b_shape, c_shape, a_type, b_type, c_type, num_slices=ctx.num_slices + ) # Make metadata tensors. # Keep the metadata tensors in the CPU for further processing if needed. @@ -356,27 +397,38 @@ def make(ctx: BenchmarkContext, # Make metadata tensors involved in correctness testing. 
# Prepare seq lens tensor - seq_len_tensor = torch.randint(ctx.seq_length, ctx.seq_length + 1, - (ctx.batch_size, )) + seq_len_tensor = torch.randint( + ctx.seq_length, ctx.seq_length + 1, (ctx.batch_size,) + ) assert total_tokens == seq_len_tensor.sum() # Prepare prompt lora indices tensor prompt_lora_indices_tensor = make_prompt_lora_mapping( - ctx.batch_size, ctx.num_active_loras, ctx.sort_by_lora_id, "cpu") + ctx.batch_size, ctx.num_active_loras, ctx.sort_by_lora_id, "cpu" + ) # Make LoRAKernelMeta token_lora_indices_tensor = make_token_lora_mapping( - total_tokens, ctx.batch_size, prompt_lora_indices_tensor, - seq_len_tensor, "cpu") + total_tokens, + ctx.batch_size, + prompt_lora_indices_tensor, + seq_len_tensor, + "cpu", + ) lora_kernel_meta = LoRAKernelMeta.make( max_loras=ctx.num_loras, max_num_tokens=token_lora_indices_tensor.size(0), - device="cpu") - lora_kernel_meta.prepare_tensors( - token_lora_mapping=token_lora_indices_tensor) - - return BenchmarkTensors(input_tensor, lora_weights, output_tensor, - lora_kernel_meta, seq_len_tensor, - prompt_lora_indices_tensor) + device="cpu", + ) + lora_kernel_meta.prepare_tensors(token_lora_mapping=token_lora_indices_tensor) + + return BenchmarkTensors( + input_tensor, + lora_weights, + output_tensor, + lora_kernel_meta, + seq_len_tensor, + prompt_lora_indices_tensor, + ) def sanity_check(self) -> None: """ @@ -386,7 +438,7 @@ def sanity_check(self) -> None: # check metadata tensors assert torch.sum(self.seq_lens) == num_tokens num_seqs = self.seq_lens.shape[0] - #assert self.seq_start_loc.shape[0] == num_seqs + # assert self.seq_start_loc.shape[0] == num_seqs assert self.prompt_lora_mapping.shape[0] == num_seqs assert self.lora_kernel_meta.token_lora_mapping.shape[0] == num_tokens @@ -430,8 +482,11 @@ def as_lora_shrink_kwargs(self) -> dict[str, Any]: _, num_tokens, _, num_slices = self.metadata() # Sanity check matrix shapes. - i_shape, lw_shape, o_shape = self.input.shape, self.lora_weights_lst[ - 0].shape, self.output.shape + i_shape, lw_shape, o_shape = ( + self.input.shape, + self.lora_weights_lst[0].shape, + self.output.shape, + ) # Expected input shape [num_tokens, hidden_size] assert len(i_shape) == 2 assert i_shape[0] == num_tokens @@ -445,16 +500,17 @@ def as_lora_shrink_kwargs(self) -> dict[str, Any]: assert o_shape == (num_slices, num_tokens, lora_rank) return { - 'inputs': self.input, - 'lora_a_weights': self.lora_weights_lst, - 'output_tensor': self.output, - 'token_lora_mapping': self.lora_kernel_meta.token_lora_mapping, - 'token_indices_sorted_by_lora_ids': - self.lora_kernel_meta.token_indices_sorted_by_lora_ids, - 'num_tokens_per_lora': self.lora_kernel_meta.num_tokens_per_lora, - 'lora_token_start_loc': self.lora_kernel_meta.lora_token_start_loc, - 'lora_ids': self.lora_kernel_meta.active_lora_ids, - 'scaling': 1.0, + "inputs": self.input, + "lora_a_weights": self.lora_weights_lst, + "output_tensor": self.output, + "token_lora_mapping": self.lora_kernel_meta.token_lora_mapping, + "token_indices_sorted_by_lora_ids": ( + self.lora_kernel_meta.token_indices_sorted_by_lora_ids + ), + "num_tokens_per_lora": self.lora_kernel_meta.num_tokens_per_lora, + "lora_token_start_loc": self.lora_kernel_meta.lora_token_start_loc, + "lora_ids": self.lora_kernel_meta.active_lora_ids, + "scaling": 1.0, } def as_lora_expand_kwargs(self, add_inputs: bool) -> dict[str, Any]: @@ -464,8 +520,11 @@ def as_lora_expand_kwargs(self, add_inputs: bool) -> dict[str, Any]: _, num_tokens, _, num_slices = self.metadata() # Sanity check matrix shapes. 
- i_shape, lw_shape, o_shape = self.input.shape, self.lora_weights_lst[ - 0].shape, self.output.shape + i_shape, lw_shape, o_shape = ( + self.input.shape, + self.lora_weights_lst[0].shape, + self.output.shape, + ) # Expected input shape : [num_slices, num_tokens, lora_rank] assert len(i_shape) == 3 assert i_shape[0] == num_slices @@ -480,22 +539,23 @@ def as_lora_expand_kwargs(self, add_inputs: bool) -> dict[str, Any]: assert o_shape == (num_tokens, hidden_size * num_slices) return { - 'inputs': self.input, - 'lora_b_weights': self.lora_weights_lst, - 'output_tensor': self.output, - 'token_lora_mapping': self.lora_kernel_meta.token_lora_mapping, - 'token_indices_sorted_by_lora_ids': - self.lora_kernel_meta.token_indices_sorted_by_lora_ids, - 'num_tokens_per_lora': self.lora_kernel_meta.num_tokens_per_lora, - 'lora_token_start_loc': self.lora_kernel_meta.lora_token_start_loc, - 'lora_ids': self.lora_kernel_meta.active_lora_ids, - 'offset_start': 0, - 'add_inputs': add_inputs, + "inputs": self.input, + "lora_b_weights": self.lora_weights_lst, + "output_tensor": self.output, + "token_lora_mapping": self.lora_kernel_meta.token_lora_mapping, + "token_indices_sorted_by_lora_ids": ( + self.lora_kernel_meta.token_indices_sorted_by_lora_ids + ), + "num_tokens_per_lora": self.lora_kernel_meta.num_tokens_per_lora, + "lora_token_start_loc": self.lora_kernel_meta.lora_token_start_loc, + "lora_ids": self.lora_kernel_meta.active_lora_ids, + "offset_start": 0, + "add_inputs": add_inputs, } - def bench_fn_kwargs(self, - op_type: OpType, - add_inputs: Optional[bool] = None) -> dict[str, Any]: + def bench_fn_kwargs( + self, op_type: OpType, add_inputs: Optional[bool] = None + ) -> dict[str, Any]: if op_type.is_shrink_fn(): assert add_inputs is None else: @@ -507,8 +567,9 @@ def bench_fn_kwargs(self, return self.as_lora_expand_kwargs(add_inputs) raise ValueError(f"Unrecognized optype {self}") - def test_correctness(self, op_type: OpType, - expand_fn_add_inputs: Optional[bool]) -> bool: + def test_correctness( + self, op_type: OpType, expand_fn_add_inputs: Optional[bool] + ) -> bool: """ Test correctness of op_type implementation against a grouped gemm reference implementation. 
@@ -518,8 +579,7 @@ def test_correctness(self, op_type: OpType, ref_output = self.output.clone() self.output.zero_() - op_type.bench_fn()( - **self.bench_fn_kwargs(op_type, expand_fn_add_inputs)) + op_type.bench_fn()(**self.bench_fn_kwargs(op_type, expand_fn_add_inputs)) op_type.run_ref_group_gemm( ref_output, @@ -528,7 +588,8 @@ def test_correctness(self, op_type: OpType, seq_lens_cpu=seq_lens_cpu, prompt_lora_mapping_cpu=prompt_lora_mapping_cpu, scaling=1.0, - add_inputs=expand_fn_add_inputs) + add_inputs=expand_fn_add_inputs, + ) rtol, atol = { torch.float16: (6e-2, 6e-2), @@ -539,13 +600,14 @@ def test_correctness(self, op_type: OpType, return torch.allclose(ref_output, self.output, rtol=rtol, atol=atol) -def bench_optype(ctx: BenchmarkContext, - arg_pool_size: int, - op_type: OpType, - cuda_graph_nops: Optional[int] = None, - expand_fn_add_inputs: Optional[bool] = None, - test_correctness: bool = False) -> TMeasurement: - +def bench_optype( + ctx: BenchmarkContext, + arg_pool_size: int, + op_type: OpType, + cuda_graph_nops: Optional[int] = None, + expand_fn_add_inputs: Optional[bool] = None, + test_correctness: bool = False, +) -> TMeasurement: assert arg_pool_size >= 1 if op_type.is_shrink_fn(): assert expand_fn_add_inputs is None @@ -553,17 +615,17 @@ def bench_optype(ctx: BenchmarkContext, assert expand_fn_add_inputs is not None # BenchmarkContext -> BenchmarkTensors - bench_tensors : list[BenchmarkTensors] = \ - [BenchmarkTensors.make(ctx, op_type) for _ in range(arg_pool_size)] + bench_tensors: list[BenchmarkTensors] = [ + BenchmarkTensors.make(ctx, op_type) for _ in range(arg_pool_size) + ] for bt in bench_tensors: bt.sanity_check() # Test correctness of our implementation. if test_correctness: - assert all([ - bt.test_correctness(op_type, expand_fn_add_inputs) - for bt in bench_tensors - ]) + assert all( + [bt.test_correctness(op_type, expand_fn_add_inputs) for bt in bench_tensors] + ) # BenchmarkTensors -> dict (kwargs) kwargs_list = [ @@ -585,40 +647,49 @@ def bench_optype(ctx: BenchmarkContext, for k, v in _kwargs.items(): kwargs[k].values.append(v) - describe_args = (f"add_inputs={expand_fn_add_inputs}" - if expand_fn_add_inputs is not None else "") - description = ( - f"{op_type.name}({describe_args}) ({bench_tensors[0].io_types()})") + describe_args = ( + f"add_inputs={expand_fn_add_inputs}" if expand_fn_add_inputs is not None else "" + ) + description = f"{op_type.name}({describe_args}) ({bench_tensors[0].io_types()})" cuda_graph_params = None if cuda_graph_nops: cuda_graph_params = CudaGraphBenchParams(cuda_graph_nops) timer = None - with Bench(cuda_graph_params, - ctx.bench_label(), ctx.bench_sublabel(op_type), description, - op_type.bench_fn(), **kwargs) as bench: + with Bench( + cuda_graph_params, + ctx.bench_label(), + ctx.bench_sublabel(op_type), + description, + op_type.bench_fn(), + **kwargs, + ) as bench: timer = bench.run() return timer -def bench_torch_mm(ctx: BenchmarkContext, - arg_pool_size: int, - op_type: OpType, - cuda_graph_nops: Optional[int] = None) -> TMeasurement: +def bench_torch_mm( + ctx: BenchmarkContext, + arg_pool_size: int, + op_type: OpType, + cuda_graph_nops: Optional[int] = None, +) -> TMeasurement: """ Benchmark basic torch.mm as a roofline. When all the input tokens have the same LoRA ID, the LoRA kernels are just - a matmul. This torch.mm benchmark serves as a roofline for that case. + a matmul. This torch.mm benchmark serves as a roofline for that case. input op_type is used in determining the m, k, n dimensions for the matmul. 
""" - batch_size, hidden_size, lora_rank, seq_length, dtype = (ctx.batch_size, - ctx.hidden_size, - ctx.lora_rank, - ctx.seq_length, - ctx.dtype) + batch_size, hidden_size, lora_rank, seq_length, dtype = ( + ctx.batch_size, + ctx.hidden_size, + ctx.lora_rank, + ctx.seq_length, + ctx.dtype, + ) m, k, n = op_type.mkn(batch_size, seq_length, hidden_size, lora_rank) # For a fairer comparison. @@ -632,18 +703,24 @@ def bench_torch_mm(ctx: BenchmarkContext, Cs.append(torch.rand((m, n), dtype=dtype).to("cuda")) # Make torch.mm kwargs - mm_kwargs = {'input': ArgPool(As), 'mat2': ArgPool(Bs), 'out': ArgPool(Cs)} + mm_kwargs = {"input": ArgPool(As), "mat2": ArgPool(Bs), "out": ArgPool(Cs)} description = ( f"single-lora roofline using torch.mm ({dtype_to_str(dtype)}" f"x{dtype_to_str(dtype)}" - f"=>{dtype_to_str(dtype)})") + f"=>{dtype_to_str(dtype)})" + ) cuda_graph_params = None if cuda_graph_nops: cuda_graph_params = CudaGraphBenchParams(cuda_graph_nops) - with Bench(cuda_graph_params, ctx.bench_label(), - ctx.bench_sublabel(op_type), description, torch.mm, - **mm_kwargs) as bench: + with Bench( + cuda_graph_params, + ctx.bench_label(), + ctx.bench_sublabel(op_type), + description, + torch.mm, + **mm_kwargs, + ) as bench: return bench.run() @@ -660,8 +737,7 @@ def use_cuda_graph_recommendation() -> str: """ -def print_timers(timers: list[TMeasurement], - args: Optional[argparse.Namespace] = None): +def print_timers(timers: list[TMeasurement], args: Optional[argparse.Namespace] = None): compare = TBenchmark.Compare(timers) compare.print() @@ -670,22 +746,23 @@ def print_timers(timers: list[TMeasurement], f"Note : The timings reported above is for {args.cuda_graph_nops} " "consecutive invocations of the benchmarking functions. " f"Please divide by {args.cuda_graph_nops} for single invocation " - "timings.") + "timings." + ) - print("Note on Comparison with torch.mm : The torch.mm numbers are " - "benchmark numbers of a simple matmul emulating the single lora " - "case. It is provided as a roofline for comparing our LoRA Kernel " - "implementations. It is expected that the LoRA kernels will be " - "slower than torch.mm in cases where num_loras is big. But for " - "small num_loras the goal should be to match the torch.mm numbers.") + print( + "Note on Comparison with torch.mm : The torch.mm numbers are " + "benchmark numbers of a simple matmul emulating the single lora " + "case. It is provided as a roofline for comparing our LoRA Kernel " + "implementations. It is expected that the LoRA kernels will be " + "slower than torch.mm in cases where num_loras is big. But for " + "small num_loras the goal should be to match the torch.mm numbers." 
+ ) def run(args: argparse.Namespace, bench_ctxs: list[BenchmarkContext]): - if args.cuda_graph_nops is not None: assert args.cuda_graph_nops > 0 - print(f"Benchmarking {args.cuda_graph_nops} invocations inside a CUDA " - "Graph") + print(f"Benchmarking {args.cuda_graph_nops} invocations inside a CUDA Graph") else: print(f"CUDA Graphs not enabled.\n{use_cuda_graph_recommendation()}") @@ -697,21 +774,30 @@ def run(args: argparse.Namespace, bench_ctxs: list[BenchmarkContext]): for bench_op in bench_ops: for num_slices in bench_op.num_slices(): _ctx = bench_ctx.with_seq_length(seq_len).with_num_slices( - num_slices) + num_slices + ) # Benchmark torch.mm as a roofline seq_len_timers.append( - bench_torch_mm(_ctx, args.arg_pool_size, bench_op, - args.cuda_graph_nops)) + bench_torch_mm( + _ctx, args.arg_pool_size, bench_op, args.cuda_graph_nops + ) + ) # Benchmark bench_op - expand_fn_add_inputs = [ - None - ] if bench_op.is_shrink_fn() else args.expand_fn_add_inputs + expand_fn_add_inputs = ( + [None] if bench_op.is_shrink_fn() else args.expand_fn_add_inputs + ) for add_input_arg in expand_fn_add_inputs: seq_len_timers.append( - bench_optype(_ctx, args.arg_pool_size, bench_op, - args.cuda_graph_nops, add_input_arg, - args.test_correctness)) + bench_optype( + _ctx, + args.arg_pool_size, + bench_op, + args.cuda_graph_nops, + add_input_arg, + args.test_correctness, + ) + ) print_timers(seq_len_timers) timers.extend(seq_len_timers) @@ -733,13 +819,17 @@ def run(args: argparse.Namespace, bench_ctxs: list[BenchmarkContext]): pickle.dump(timers, f) -def as_benchmark_contexts(hidden_sizes: list[int], lora_ranks: list[int], - args: argparse.Namespace) -> list[BenchmarkContext]: - +def as_benchmark_contexts( + hidden_sizes: list[int], lora_ranks: list[int], args: argparse.Namespace +) -> list[BenchmarkContext]: ctxs: list[BenchmarkContext] = [] for batch_size, hidden_size, lora_rank, num_loras, sort_by_lora_id in product( # noqa - args.batch_sizes, list(hidden_sizes), lora_ranks, args.num_loras, - args.sort_by_lora_id): + args.batch_sizes, + list(hidden_sizes), + lora_ranks, + args.num_loras, + args.sort_by_lora_id, + ): ctxs.append( BenchmarkContext( batch_size=batch_size, @@ -747,13 +837,16 @@ def as_benchmark_contexts(hidden_sizes: list[int], lora_ranks: list[int], lora_rank=lora_rank, num_loras=num_loras, num_active_loras=args.num_active_loras - if args.num_active_loras else num_loras, + if args.num_active_loras + else num_loras, # To be filled based on the OpType to benchmark seq_length=None, sort_by_lora_id=sort_by_lora_id, dtype=args.dtype, # To be filled based on the OpType to benchmark - num_slices=None)) + num_slices=None, + ) + ) return ctxs @@ -761,13 +854,16 @@ def as_benchmark_contexts(hidden_sizes: list[int], lora_ranks: list[int], def run_list_bench(args: argparse.Namespace): print(args) - print("List bench :\n" - f" Hidden Sizes {args.hidden_sizes}" - f" LoRA Ranks {args.lora_ranks}") + print( + "List bench :\n" + f" Hidden Sizes {args.hidden_sizes}" + f" LoRA Ranks {args.lora_ranks}" + ) # Get all benchmarking contexts bench_contexts: list[BenchmarkContext] = as_benchmark_contexts( - hidden_sizes=args.hidden_sizes, lora_ranks=args.lora_ranks, args=args) + hidden_sizes=args.hidden_sizes, lora_ranks=args.lora_ranks, args=args + ) run(args, bench_contexts) @@ -776,19 +872,22 @@ def run_range_bench(args: argparse.Namespace): print(args) hidden_sizes = list( - range(args.hidden_sizes_start, args.hidden_sizes_end + 1, - args.hidden_sizes_increment)) + range( + args.hidden_sizes_start, + 
args.hidden_sizes_end + 1, + args.hidden_sizes_increment, + ) + ) lora_ranks = list( - range(args.lora_ranks_start, args.lora_ranks_end + 1, - args.lora_ranks_increment)) + range(args.lora_ranks_start, args.lora_ranks_end + 1, args.lora_ranks_increment) + ) - print("Range bench :\n" - f" Hidden Sizes {hidden_sizes}" - f" LoRA Ranks {lora_ranks}") + print(f"Range bench :\n Hidden Sizes {hidden_sizes} LoRA Ranks {lora_ranks}") # Get all benchmarking contexts bench_contexts: list[BenchmarkContext] = as_benchmark_contexts( - hidden_sizes=hidden_sizes, lora_ranks=lora_ranks, args=args) + hidden_sizes=hidden_sizes, lora_ranks=lora_ranks, args=args + ) run(args, bench_contexts) @@ -806,21 +905,19 @@ def hidden_sizes_from_model(model: str, tp_size: int) -> set[int]: # Get all hidden sizes hidden_sizes: set[int] = set() for model_name, tp_size in product(args.models, args.tp_sizes): - hidden_sizes = hidden_sizes.union( - hidden_sizes_from_model(model_name, tp_size)) + hidden_sizes = hidden_sizes.union(hidden_sizes_from_model(model_name, tp_size)) - print("Model bench :\n" - f" Hidden Sizes {hidden_sizes}" - f" LoRA Ranks {args.lora_ranks}") + print(f"Model bench :\n Hidden Sizes {hidden_sizes} LoRA Ranks {args.lora_ranks}") # Get all benchmarking contexts bench_contexts: list[BenchmarkContext] = as_benchmark_contexts( - hidden_sizes=hidden_sizes, lora_ranks=args.lora_ranks, args=args) + hidden_sizes=hidden_sizes, lora_ranks=args.lora_ranks, args=args + ) run(args, bench_contexts) -if __name__ == '__main__': +if __name__ == "__main__": def to_torch_dtype(dt): if dt == "torch.float16": @@ -830,14 +927,15 @@ def to_torch_dtype(dt): raise ValueError("unsupported dtype") def get_bool(s: str) -> bool: - return s.lower() in ['true', '1'] + return s.lower() in ["true", "1"] def add_common_command_args(p: argparse.ArgumentParser): p.add_argument( "--dtype", type=to_torch_dtype, required=True, - help="Available options are ['torch.float16', 'torch.bfloat16']") + help="Available options are ['torch.float16', 'torch.bfloat16']", + ) p.add_argument( "--arg-pool-size", @@ -845,56 +943,66 @@ def add_common_command_args(p: argparse.ArgumentParser): default=32, help="Run profiles with a pool of input/output/meta tensors instead" "of simply reusing the same tensors for all runs. A bigger arg-pool" - "mitigates hardware caching effects during benchmarking.") + "mitigates hardware caching effects during benchmarking.", + ) p.add_argument( "--cuda-graph-nops", type=int, - help=("when set profiling is done using cudagraph, " - "with the given number of operations in a graph." - "Note that the measurement returned is the time " - "taken for N consecutive executions of the benchmarking " - "functions, where N is the value of this argument.")) - p.add_argument("--num-loras", - nargs="+", - type=int, - default=DEFAULT_NUM_LORAS) - p.add_argument("--num-active-loras", - type=int, - default=None, - help="Active LoRAs. 
When None, all LoRAs are active") - p.add_argument("--sort-by-lora-id", - nargs="+", - type=get_bool, - default=DEFAULT_SORT_BY_LORA_IDS) - p.add_argument("--op-types", - nargs="+", - type=OpType.from_str, - default=list(OpType)) - p.add_argument('--seq-lengths', - nargs="+", - type=int, - default=DEFAULT_SEQ_LENGTHS) - p.add_argument("--batch-sizes", - nargs="+", - type=int, - default=DEFAULT_BATCH_SIZES) - p.add_argument("--expand-fn-add-inputs", - nargs="+", - type=get_bool, - default=DEFAULT_EXPAND_FN_ADD_INPUTS) + help=( + "when set profiling is done using cudagraph, " + "with the given number of operations in a graph." + "Note that the measurement returned is the time " + "taken for N consecutive executions of the benchmarking " + "functions, where N is the value of this argument." + ), + ) + p.add_argument("--num-loras", nargs="+", type=int, default=DEFAULT_NUM_LORAS) + p.add_argument( + "--num-active-loras", + type=int, + default=None, + help="Active LoRAs. When None, all LoRAs are active", + ) + p.add_argument( + "--sort-by-lora-id", + nargs="+", + type=get_bool, + default=DEFAULT_SORT_BY_LORA_IDS, + ) + p.add_argument( + "--op-types", nargs="+", type=OpType.from_str, default=list(OpType) + ) + p.add_argument( + "--seq-lengths", nargs="+", type=int, default=DEFAULT_SEQ_LENGTHS + ) + p.add_argument( + "--batch-sizes", nargs="+", type=int, default=DEFAULT_BATCH_SIZES + ) + p.add_argument( + "--expand-fn-add-inputs", + nargs="+", + type=get_bool, + default=DEFAULT_EXPAND_FN_ADD_INPUTS, + ) p.add_argument( - '-o', - '--output-directory', + "-o", + "--output-directory", type=str, - help=("Output directory to store a the list of benchmarking" - "TMeasurement objects as a pickle file")) + help=( + "Output directory to store a the list of benchmarking" + "TMeasurement objects as a pickle file" + ), + ) p.add_argument( "--test-correctness", - action='store_true', - help=("When enabled, the benchmarking functions are tested" - "for correctness before the actual benchmarking")) + action="store_true", + help=( + "When enabled, the benchmarking functions are tested" + "for correctness before the actual benchmarking" + ), + ) parser = FlexibleArgumentParser( description=f""" @@ -910,50 +1018,45 @@ def add_common_command_args(p: argparse.ArgumentParser): range_bench example: python3 benchmarks/kernels/benchmark_lora.py range_bench --arg-pool-size 32 --batch-sizes 1 16 32 --dtype torch.float16 --num-loras 1 4 --op-types lora_shrink lora_expand --seq-lengths 1 16 --sort-by-lora-id 1 --cuda-graph-nops 32 --hidden-sizes-start 1024 --hidden-sizes-end 4096 --hidden-sizes-increment 1024 --lora-ranks-start 8 --lora-ranks-end 24 --lora-ranks-increment 8 """, # noqa: E501 - formatter_class=argparse.RawTextHelpFormatter) + formatter_class=argparse.RawTextHelpFormatter, + ) subparsers = parser.add_subparsers(dest="cmd", required=True) list_parser = subparsers.add_parser("list_bench") - list_parser.add_argument("--hidden-sizes", - nargs="+", - type=int, - default=DEFAULT_HIDDEN_SIZES) - list_parser.add_argument("--lora-ranks", - nargs="+", - type=int, - default=DEFAULT_LORA_RANKS) + list_parser.add_argument( + "--hidden-sizes", nargs="+", type=int, default=DEFAULT_HIDDEN_SIZES + ) + list_parser.add_argument( + "--lora-ranks", nargs="+", type=int, default=DEFAULT_LORA_RANKS + ) add_common_command_args(list_parser) list_parser.set_defaults(func=run_list_bench) range_parser = subparsers.add_parser("range_bench") range_parser.add_argument("--hidden-sizes-start", type=int, required=True) 
range_parser.add_argument("--hidden-sizes-end", type=int, required=True) - range_parser.add_argument("--hidden-sizes-increment", - type=int, - required=True) + range_parser.add_argument("--hidden-sizes-increment", type=int, required=True) range_parser.add_argument("--lora-ranks-start", type=int, required=True) range_parser.add_argument("--lora-ranks-end", type=int, required=True) - range_parser.add_argument("--lora-ranks-increment", - type=int, - required=True) + range_parser.add_argument("--lora-ranks-increment", type=int, required=True) add_common_command_args(range_parser) range_parser.set_defaults(func=run_range_bench) model_parser = subparsers.add_parser("model_bench") - model_parser.add_argument("--models", - nargs="+", - type=str, - default=DEFAULT_MODELS, - choices=WEIGHT_SHAPES.keys()) - model_parser.add_argument("--tp-sizes", - nargs="+", - type=int, - default=DEFAULT_TP_SIZES) - model_parser.add_argument("--lora-ranks", - nargs="+", - type=int, - default=DEFAULT_LORA_RANKS) + model_parser.add_argument( + "--models", + nargs="+", + type=str, + default=DEFAULT_MODELS, + choices=WEIGHT_SHAPES.keys(), + ) + model_parser.add_argument( + "--tp-sizes", nargs="+", type=int, default=DEFAULT_TP_SIZES + ) + model_parser.add_argument( + "--lora-ranks", nargs="+", type=int, default=DEFAULT_LORA_RANKS + ) add_common_command_args(model_parser) model_parser.set_defaults(func=run_model_bench) diff --git a/benchmarks/kernels/benchmark_machete.py b/benchmarks/kernels/benchmark_machete.py index a661ea9d7e60..f8f1db04790b 100644 --- a/benchmarks/kernels/benchmark_machete.py +++ b/benchmarks/kernels/benchmark_machete.py @@ -20,12 +20,18 @@ from vllm import _custom_ops as ops from vllm.model_executor.layers.quantization.utils.marlin_utils import ( - GPTQ_MARLIN_MAX_PARALLEL, GPTQ_MARLIN_MIN_THREAD_N, marlin_permute_scales, - marlin_zero_points) + GPTQ_MARLIN_MAX_PARALLEL, + GPTQ_MARLIN_MIN_THREAD_N, + marlin_permute_scales, + marlin_zero_points, +) from vllm.model_executor.layers.quantization.utils.marlin_utils_test import ( - MarlinWorkspace) + MarlinWorkspace, +) from vllm.model_executor.layers.quantization.utils.quant_utils import ( - pack_rows, quantize_weights) + pack_rows, + quantize_weights, +) from vllm.scalar_type import ScalarType, scalar_types from vllm.utils import FlexibleArgumentParser @@ -82,12 +88,14 @@ def rand_data(shape, dtype=torch.float16, scale=1): return torch.randint(-15, 15, shape, dtype=dtype, device="cuda") -def quantize_and_pack(atype: torch.dtype, - w: torch.Tensor, - wtype: ScalarType, - stype: Optional[torch.dtype], - group_size: Optional[int], - zero_points: bool = False): +def quantize_and_pack( + atype: torch.dtype, + w: torch.Tensor, + wtype: ScalarType, + stype: Optional[torch.dtype], + group_size: Optional[int], + zero_points: bool = False, +): assert wtype.is_integer(), "TODO: support floating point weights" w_ref, w_q, w_s, w_zp = quantize_weights( @@ -96,21 +104,24 @@ def quantize_and_pack(atype: torch.dtype, group_size=group_size, zero_points=zero_points, # to match how the kernel applies zps - ref_zero_points_after_scales=True) + ref_zero_points_after_scales=True, + ) w_q = pack_rows(w_q, wtype.size_bits, *w_q.shape) return w_ref, w_q, w_s, w_zp -def create_bench_tensors(shape: tuple[int, int, int], types: TypeConfig, - group_size: Optional[int]) -> list[BenchmarkTensors]: +def create_bench_tensors( + shape: tuple[int, int, int], types: TypeConfig, group_size: Optional[int] +) -> list[BenchmarkTensors]: m, n, k = shape # we want to make sure that weights don't 
fit into L2 cache between runs so # we construct enough weights to exceed L2 cache, which is 50mb on a H100 # so we target total weight size > 2*50mb - num_weights = math.ceil(2 * 50 * 1024**2 * 8 / - (k * n * types.weight_type.size_bits)) + num_weights = math.ceil( + 2 * 50 * 1024**2 * 8 / (k * n * types.weight_type.size_bits) + ) a = rand_data((m, k), types.act_type, scale=5) @@ -124,8 +135,13 @@ def create_bench_tensors(shape: tuple[int, int, int], types: TypeConfig, w = w.to(torch.float16) w_ref, w_q_packed, w_s, w_zp = quantize_and_pack( - a.dtype, w, types.weight_type, types.group_scale_type, group_size, - types.group_zero_type is not None) + a.dtype, + w, + types.weight_type, + types.group_scale_type, + group_size, + types.group_zero_type is not None, + ) if not a.dtype.is_floating_point: aiinfo = torch.iinfo(a.dtype) @@ -133,21 +149,30 @@ def create_bench_tensors(shape: tuple[int, int, int], types: TypeConfig, w_ref = w_ref.to(torch.float32) - w_ch_s = None if types.channel_scale_type is None else\ - rand_data((n,), types.channel_scale_type) - w_tok_s = None if types.token_scale_type is None else\ - rand_data((m,), types.token_scale_type) + w_ch_s = ( + None + if types.channel_scale_type is None + else rand_data((n,), types.channel_scale_type) + ) + w_tok_s = ( + None + if types.token_scale_type is None + else rand_data((m,), types.token_scale_type) + ) benchmark_tensors.append( - BenchmarkTensors(w_ref=w_ref, - a=a, - w_q=w_q_packed, - wtype=types.weight_type, - w_g_s=w_s, - w_g_zp=w_zp, - group_size=group_size, - w_ch_s=w_ch_s, - w_tok_s=w_tok_s)) + BenchmarkTensors( + w_ref=w_ref, + a=a, + w_q=w_q_packed, + wtype=types.weight_type, + w_g_s=w_s, + w_g_zp=w_zp, + group_size=group_size, + w_ch_s=w_ch_s, + w_tok_s=w_tok_s, + ) + ) return benchmark_tensors @@ -170,50 +195,57 @@ def cutlass_scaled_mm_create_bench_fn(bt: BenchmarkTensors) -> Callable: scale_b = torch.tensor(1.0, dtype=torch.float32, device=bt.a.device) w_col_major = bt.w_ref.to(bt.a.dtype).t().contiguous().t() return lambda: ops.cutlass_scaled_mm( - bt.a, w_col_major, scale_a, scale_b, out_dtype=torch.float16) + bt.a, w_col_major, scale_a, scale_b, out_dtype=torch.float16 + ) def marlin_create_bench_fn(bt: BenchmarkTensors) -> Callable: device = bt.a.device - workspace = MarlinWorkspace(bt.w_ref.shape[1], GPTQ_MARLIN_MIN_THREAD_N, - GPTQ_MARLIN_MAX_PARALLEL) + workspace = MarlinWorkspace( + bt.w_ref.shape[1], GPTQ_MARLIN_MIN_THREAD_N, GPTQ_MARLIN_MAX_PARALLEL + ) if bt.w_g_zp is None: w_zp = torch.empty(0, dtype=torch.int, device=device) else: - w_zp = marlin_zero_points(bt.w_g_zp, bt.w_ref.shape[0], - bt.w_ref.shape[1], bt.wtype.size_bits) + w_zp = marlin_zero_points( + bt.w_g_zp, bt.w_ref.shape[0], bt.w_ref.shape[1], bt.wtype.size_bits + ) if bt.group_size is None: w_s = torch.tensor([], device="cuda", dtype=torch.half) else: - w_s = marlin_permute_scales(bt.w_g_s, bt.w_ref.shape[0], - bt.w_ref.shape[1], bt.group_size) + w_s = marlin_permute_scales( + bt.w_g_s, bt.w_ref.shape[0], bt.w_ref.shape[1], bt.group_size + ) sort_indices = torch.empty(0, dtype=torch.int, device=device) g_idx = torch.empty(0, dtype=torch.int, device=device) - w_q = ops.gptq_marlin_repack(bt.w_q, sort_indices, bt.w_ref.shape[0], - bt.w_ref.shape[1], bt.wtype.size_bits) + w_q = ops.gptq_marlin_repack( + bt.w_q, sort_indices, bt.w_ref.shape[0], bt.w_ref.shape[1], bt.wtype.size_bits + ) if bt.a.dtype.is_floating_point: assert bt.w_ch_s is None assert bt.w_tok_s is None assert bt.group_size is not None - fn = lambda: ops.gptq_marlin_gemm(a=bt.a, - 
b_q_weight=w_q, - b_scales=w_s, - b_zeros=w_zp, - g_idx=g_idx, - perm=sort_indices, - workspace=workspace.scratch, - b_q_type=bt.wtype, - size_m=bt.a.shape[0], - size_n=bt.w_ref.shape[1], - size_k=bt.w_ref.shape[0], - is_k_full=True, - is_zp_float=False) + fn = lambda: ops.gptq_marlin_gemm( + a=bt.a, + b_q_weight=w_q, + b_scales=w_s, + b_zeros=w_zp, + g_idx=g_idx, + perm=sort_indices, + workspace=workspace.scratch, + b_q_type=bt.wtype, + size_m=bt.a.shape[0], + size_n=bt.w_ref.shape[1], + size_k=bt.w_ref.shape[0], + is_k_full=True, + is_zp_float=False, + ) else: assert bt.a.dtype == torch.int8 assert bt.wtype == scalar_types.uint4b8 @@ -221,36 +253,35 @@ def marlin_create_bench_fn(bt: BenchmarkTensors) -> Callable: if bt.w_ch_s is not None: s_ch = bt.w_ch_s.to(torch.float32) else: - s_ch = torch.ones(bt.w_ref.shape[1], - dtype=torch.float32, - device=device) + s_ch = torch.ones(bt.w_ref.shape[1], dtype=torch.float32, device=device) if bt.w_tok_s is not None: s_tok = bt.w_tok_s.to(torch.float32) else: - s_tok = torch.ones(bt.a.shape[0], - dtype=torch.float32, - device=device) - - fn = lambda: ops.marlin_qqq_gemm(a=bt.a, - b_q_weight=w_q, - s_group=w_s, - s_tok=s_tok, - s_ch=s_ch, - workspace=workspace.scratch, - size_m=bt.a.shape[0], - size_n=bt.w_ref.shape[1], - size_k=bt.w_ref.shape[0]) + s_tok = torch.ones(bt.a.shape[0], dtype=torch.float32, device=device) + + fn = lambda: ops.marlin_qqq_gemm( + a=bt.a, + b_q_weight=w_q, + s_group=w_s, + s_tok=s_tok, + s_ch=s_ch, + workspace=workspace.scratch, + size_m=bt.a.shape[0], + size_n=bt.w_ref.shape[1], + size_k=bt.w_ref.shape[0], + ) return fn -def machete_create_bench_fn(bt: BenchmarkTensors, - out_type=torch.dtype, - schedule=None) -> Callable: +def machete_create_bench_fn( + bt: BenchmarkTensors, out_type=torch.dtype, schedule=None +) -> Callable: w_q = bt.w_q.t().contiguous().t() # make col major - w_q = ops.machete_prepack_B(w_q, bt.a.dtype, bt.wtype, - None if bt.w_g_s is None else bt.w_g_s.dtype) + w_q = ops.machete_prepack_B( + w_q, bt.a.dtype, bt.wtype, None if bt.w_g_s is None else bt.w_g_s.dtype + ) w_g_zp = bt.w_g_zp if w_g_zp is not None: @@ -275,26 +306,24 @@ def machete_create_bench_fn(bt: BenchmarkTensors, # bench -def bench_fns(label: str, sub_label: str, description: str, - fns: list[Callable]): - +def bench_fns(label: str, sub_label: str, description: str, fns: list[Callable]): min_run_time = 1 if not NVTX_PROFILE else 0.1 res = TBenchmark.Timer( stmt=""" for fn in fns: fn() """, - globals={ - "fns": fns - }, + globals={"fns": fns}, label=label, sub_label=sub_label, description=description, ).blocked_autorange(min_run_time=min_run_time) if NVTX_PROFILE: - with nvtx.annotate("mm-bench"), nvtx.annotate( - f"{label}|{sub_label}|{description}"): + with ( + nvtx.annotate("mm-bench"), + nvtx.annotate(f"{label}|{sub_label}|{description}"), + ): fns[0]() return res @@ -304,19 +333,20 @@ def bench_fns(label: str, sub_label: str, description: str, _SWEEP_SCHEDULES_RESULTS_CSV: Optional[str] = None -def bench(types: TypeConfig, - group_size: int, - m: int, - k: int, - n: int, - label: str, - sub_label: str, - sweep_schedules: bool = True) -> list[TMeasurement]: +def bench( + types: TypeConfig, + group_size: int, + m: int, + k: int, + n: int, + label: str, + sub_label: str, + sweep_schedules: bool = True, +) -> list[TMeasurement]: benchmark_tensors = create_bench_tensors((m, n, k), types, group_size) sub_label += f", L={len(benchmark_tensors)}" - name_type_string = f"W{types.weight_type}"+\ - f"-A{terse_type_name(types.act_type)}" + 
name_type_string = f"W{types.weight_type}" + f"-A{terse_type_name(types.act_type)}" if types.group_scale_type is not None: name_type_string += f"-GS{terse_type_name(types.group_scale_type)}" if types.group_zero_type is not None: @@ -332,31 +362,45 @@ def bench(types: TypeConfig, # pytorch impl timers.append( bench_fns( - label, sub_label, "torch.matmul (fp16)", - [torch_matmul_f16_create_bench_fn(bt) - for bt in benchmark_tensors])) + label, + sub_label, + "torch.matmul (fp16)", + [torch_matmul_f16_create_bench_fn(bt) for bt in benchmark_tensors], + ) + ) if types.act_type == torch.int8 or types.act_type == torch.float8_e4m3fn: timers.append( bench_fns( - label, sub_label, - f"cutlass_scaled_mm ({terse_type_name(types.act_type)})", [ - cutlass_scaled_mm_create_bench_fn(bt) - for bt in benchmark_tensors - ])) + label, + sub_label, + f"cutlass_scaled_mm ({terse_type_name(types.act_type)})", + [cutlass_scaled_mm_create_bench_fn(bt) for bt in benchmark_tensors], + ) + ) if types.act_type != torch.float8_e4m3fn: timers.append( - bench_fns(label, sub_label, f"marlin ({name_type_string})", - [marlin_create_bench_fn(bt) - for bt in benchmark_tensors])) + bench_fns( + label, + sub_label, + f"marlin ({name_type_string})", + [marlin_create_bench_fn(bt) for bt in benchmark_tensors], + ) + ) # machete timers.append( - bench_fns(label, sub_label, f"machete ({name_type_string})", [ - machete_create_bench_fn(bt, out_type=types.output_type) - for bt in benchmark_tensors - ])) + bench_fns( + label, + sub_label, + f"machete ({name_type_string})", + [ + machete_create_bench_fn(bt, out_type=types.output_type) + for bt in benchmark_tensors + ], + ) + ) if sweep_schedules: global _SWEEP_SCHEDULES_RESULTS @@ -371,7 +415,8 @@ def bench(types: TypeConfig, group_zeros_type=types.group_zero_type, token_scales_type=types.token_scale_type, channel_scales_type=types.channel_scale_type, - out_type=types.output_type) + out_type=types.output_type, + ) if schedules is None or len(schedules) == 0: raise ValueError("No schedules found to sweep") @@ -383,11 +428,17 @@ def bench(types: TypeConfig, if schedule_M >= 2 * max(m, 16) or schedule_M < m // 4: continue - res = bench_fns(label, sub_label, "machete_best", [ - machete_create_bench_fn( - bt, out_type=types.output_type, schedule=schedule) - for bt in benchmark_tensors - ]) + res = bench_fns( + label, + sub_label, + "machete_best", + [ + machete_create_bench_fn( + bt, out_type=types.output_type, schedule=schedule + ) + for bt in benchmark_tensors + ], + ) results_row = { "M": m, @@ -398,10 +449,8 @@ def bench(types: TypeConfig, "median": res.median, } if _SWEEP_SCHEDULES_RESULTS is None: - _SWEEP_SCHEDULES_RESULTS = pd.DataFrame( - columns=results_row.keys()) - _SWEEP_SCHEDULES_RESULTS.\ - loc[len(_SWEEP_SCHEDULES_RESULTS)] = results_row + _SWEEP_SCHEDULES_RESULTS = pd.DataFrame(columns=results_row.keys()) + _SWEEP_SCHEDULES_RESULTS.loc[len(_SWEEP_SCHEDULES_RESULTS)] = results_row print(f" {res.median:5.5} ", schedule) if not best or res.median < best.median: @@ -422,8 +471,9 @@ def print_timers(timers: list[TMeasurement]): def run(args, MKNs: Iterable[tuple[int, int, int]]) -> Iterable[TMeasurement]: types = TypeConfig( act_type=args.act_type, - weight_type=scalar_types.uint4b8 if args.group_zero_type is None \ - else scalar_types.uint4, + weight_type=scalar_types.uint4b8 + if args.group_zero_type is None + else scalar_types.uint4, output_type=args.out_type, group_scale_type=args.group_scale_type, group_zero_type=args.group_zero_type, @@ -433,14 +483,16 @@ def run(args, 
MKNs: Iterable[tuple[int, int, int]]) -> Iterable[TMeasurement]: results: list[TMeasurement] = [] for m, k, n in MKNs: - timers = bench(types, - args.group_size, - m, - k, - n, - f"{args.act_type}-gemm", - f"MKN=({m}x{k}x{n})", - sweep_schedules=args.sweep_schedules) + timers = bench( + types, + args.group_size, + m, + k, + n, + f"{args.act_type}-gemm", + f"MKN=({m}x{k}x{n})", + sweep_schedules=args.sweep_schedules, + ) print_timers(timers) results.extend(timers) @@ -454,7 +506,6 @@ def make_output( base_description: str, timestamp=None, ): - print(f"== All Results {base_description} ====") print_timers(data) @@ -468,8 +519,7 @@ def make_output( def run_square_bench(args): - dim_sizes = list( - range(args.dim_start, args.dim_end + 1, args.dim_increment)) + dim_sizes = list(range(args.dim_start, args.dim_end + 1, args.dim_increment)) MKNs = list(zip(dim_sizes, dim_sizes, dim_sizes)) data = run(args.dtype, args.sweep_schedules, MKNs) @@ -479,8 +529,9 @@ def run_square_bench(args): def run_range_bench(args): m_start, k_start, n_start = (int(x) for x in args.dim_start.split(",")) m_end, k_end, n_end = (int(x) for x in args.dim_end.split(",")) - m_increment, k_increment, n_increment = \ - (int(x) for x in args.dim_increment.split(",")) + m_increment, k_increment, n_increment = ( + int(x) for x in args.dim_increment.split(",") + ) Ms = list(range(m_start, m_end + 1, m_increment)) Ks = list(range(k_start, k_end + 1, k_increment)) Ns = list(range(n_start, n_end + 1, n_increment)) @@ -492,7 +543,6 @@ def run_range_bench(args): def run_model_bench(args): - print("Benchmarking models:") for i, model in enumerate(args.models): print(f"[{i}] {model}") @@ -535,10 +585,13 @@ def model_shapes(model_name: str, tp_size: int) -> list[tuple[int, int]]: with open(f"model_bench-{type_string}-{timestr}.pkl", "wb") as f: args_dict = vars(args) args_dict.pop("func") - pkl.dump({ - "args": args_dict, - "results": all_results, - }, f) + pkl.dump( + { + "args": args_dict, + "results": all_results, + }, + f, + ) if __name__ == "__main__": @@ -554,7 +607,6 @@ def to_torch_dtype(dt): }[dt] class ToTorchDtype(argparse.Action): - def __call__(self, parser, namespace, values, option_string=None): setattr(namespace, self.dest, to_torch_dtype(values)) @@ -580,32 +632,32 @@ def __call__(self, parser, namespace, values, option_string=None): "--act-type", action=ToTorchDtype, required=True, - choices=['bfloat16', 'float16', 'int8', 'float8_e4m3fn'], + choices=["bfloat16", "float16", "int8", "float8_e4m3fn"], ) parser.add_argument( "--group-scale-type", action=ToTorchDtype, - choices=['bfloat16', 'float16'], + choices=["bfloat16", "float16"], ) parser.add_argument( "--group-zero-type", type=to_torch_dtype, - choices=['bfloat16', 'float16'], + choices=["bfloat16", "float16"], ) parser.add_argument( "--channel-scale-type", action=ToTorchDtype, - choices=['float'], + choices=["float"], ) parser.add_argument( "--token-scale-type", action=ToTorchDtype, - choices=['float'], + choices=["float"], ) parser.add_argument( "--out-type", action=ToTorchDtype, - choices=['bfloat16', 'float16'], + choices=["bfloat16", "float16"], ) parser.add_argument( "--group-size", @@ -618,9 +670,11 @@ def __call__(self, parser, namespace, values, option_string=None): action="store_true", help="Run a sweep over all supported schedules", ) - parser.add_argument("--sweep-csv-out", - help="CSV to store sweep results", - default="sch_sweep_results.csv") + parser.add_argument( + "--sweep-csv-out", + help="CSV to store sweep results", + 
default="sch_sweep_results.csv", + ) subparsers = parser.add_subparsers(dest="cmd", required=True) square_parser = subparsers.add_parser("square_bench") @@ -634,17 +688,20 @@ def __call__(self, parser, namespace, values, option_string=None): "--dim-start", type=str, required=True, - help="Start value for M,K,N as common separated list") + help="Start value for M,K,N as common separated list", + ) range_parser.add_argument( "--dim-end", type=str, required=True, - help="End value (inclusive) for M,K,N as common separated list") + help="End value (inclusive) for M,K,N as common separated list", + ) range_parser.add_argument( "--dim-increment", type=str, required=True, - help="Increment value for M,K,N as common separated list") + help="Increment value for M,K,N as common separated list", + ) range_parser.set_defaults(func=run_range_bench) model_parser = subparsers.add_parser("model_bench") @@ -655,14 +712,12 @@ def __call__(self, parser, namespace, values, option_string=None): default=DEFAULT_MODELS, choices=WEIGHT_SHAPES.keys(), ) - model_parser.add_argument("--tp-sizes", - nargs="+", - type=int, - default=DEFAULT_TP_SIZES) - model_parser.add_argument("--batch-sizes", - nargs="+", - type=int, - default=DEFAULT_BATCH_SIZES) + model_parser.add_argument( + "--tp-sizes", nargs="+", type=int, default=DEFAULT_TP_SIZES + ) + model_parser.add_argument( + "--batch-sizes", nargs="+", type=int, default=DEFAULT_BATCH_SIZES + ) model_parser.set_defaults(func=run_model_bench) args = parser.parse_args() diff --git a/benchmarks/kernels/benchmark_marlin.py b/benchmarks/kernels/benchmark_marlin.py index 1e785ac8fc73..b17baff2e5f5 100644 --- a/benchmarks/kernels/benchmark_marlin.py +++ b/benchmarks/kernels/benchmark_marlin.py @@ -6,19 +6,34 @@ from vllm import _custom_ops as ops from vllm.model_executor.layers.quantization.gptq_marlin_24 import ( - GPTQ_MARLIN_24_MAX_PARALLEL, GPTQ_MARLIN_24_MIN_THREAD_N, - GPTQ_MARLIN_24_SUPPORTED_GROUP_SIZES, GPTQ_MARLIN_24_SUPPORTED_QUANT_TYPES) + GPTQ_MARLIN_24_MAX_PARALLEL, + GPTQ_MARLIN_24_MIN_THREAD_N, + GPTQ_MARLIN_24_SUPPORTED_GROUP_SIZES, + GPTQ_MARLIN_24_SUPPORTED_QUANT_TYPES, +) from vllm.model_executor.layers.quantization.utils.allspark_utils import ( - ALLSPARK_AMPERE_M_CUBLAS_THRESHOLD, ALLSPARK_SUPPORTED_QUANT_TYPES) + ALLSPARK_AMPERE_M_CUBLAS_THRESHOLD, + ALLSPARK_SUPPORTED_QUANT_TYPES, +) from vllm.model_executor.layers.quantization.utils.marlin_utils import ( - GPTQ_MARLIN_MAX_PARALLEL, GPTQ_MARLIN_MIN_THREAD_N, - MARLIN_SUPPORTED_GROUP_SIZES, query_marlin_supported_quant_types) + GPTQ_MARLIN_MAX_PARALLEL, + GPTQ_MARLIN_MIN_THREAD_N, + MARLIN_SUPPORTED_GROUP_SIZES, + query_marlin_supported_quant_types, +) from vllm.model_executor.layers.quantization.utils.marlin_utils_test import ( - MarlinWorkspace, marlin_quantize) + MarlinWorkspace, + marlin_quantize, +) from vllm.model_executor.layers.quantization.utils.marlin_utils_test_24 import ( - marlin_24_quantize) + marlin_24_quantize, +) from vllm.model_executor.layers.quantization.utils.quant_utils import ( - gptq_pack, gptq_quantize_weights, quantize_weights, sort_weights) + gptq_pack, + gptq_quantize_weights, + quantize_weights, + sort_weights, +) from vllm.scalar_type import ScalarType from vllm.utils import FlexibleArgumentParser @@ -29,22 +44,29 @@ K_FULL_OPTS = [False, True] -def bench_run(results: list[benchmark.Measurement], model: str, - act_order: bool, is_k_full: bool, quant_type: ScalarType, - group_size: int, size_m: int, size_k: int, size_n: int): +def bench_run( + results: 
list[benchmark.Measurement], + model: str, + act_order: bool, + is_k_full: bool, + quant_type: ScalarType, + group_size: int, + size_m: int, + size_k: int, + size_n: int, +): label = "Quant Matmul" - sub_label = ("{}, act={} k_full={}, q={}, g={}, " - "MKN=({}x{}x{})".format(model, act_order, is_k_full, - str(quant_type), group_size, size_m, - size_k, size_n)) + sub_label = "{}, act={} k_full={}, q={}, g={}, MKN=({}x{}x{})".format( + model, act_order, is_k_full, str(quant_type), group_size, size_m, size_k, size_n + ) print(f"Testing: {sub_label}") a = torch.randn(size_m, size_k).to(torch.half).cuda() b = torch.rand(size_k, size_n).to(torch.half).cuda() - a_tmp = (torch.zeros(size_m, size_k).to(torch.half).cuda()) + a_tmp = torch.zeros(size_m, size_k).to(torch.half).cuda() # Marlin quant ( @@ -57,14 +79,16 @@ def bench_run(results: list[benchmark.Measurement], model: str, ) = marlin_quantize(b, quant_type, group_size, act_order) # Marlin_24 quant - (marlin_24_w_ref, marlin_24_q_w_comp, marlin_24_meta, - marlin_24_s) = marlin_24_quantize(b, quant_type, group_size) + (marlin_24_w_ref, marlin_24_q_w_comp, marlin_24_meta, marlin_24_s) = ( + marlin_24_quantize(b, quant_type, group_size) + ) marlin_zp = torch.empty(0, dtype=torch.int, device=b.device) # GPTQ quant - (w_ref, q_w, s, g_idx, - rand_perm) = gptq_quantize_weights(b, quant_type, group_size, act_order) + (w_ref, q_w, s, g_idx, rand_perm) = gptq_quantize_weights( + b, quant_type, group_size, act_order + ) q_w_gptq = gptq_pack(q_w, quant_type.size_bits, size_k, size_n) # For act_order, sort the "weights" and "g_idx" @@ -74,32 +98,37 @@ def bench_run(results: list[benchmark.Measurement], model: str, (q_w, g_idx, repack_sort_indices) = sort_weights(q_w, g_idx) # Prepare - marlin_workspace = MarlinWorkspace(size_n, GPTQ_MARLIN_MIN_THREAD_N, - GPTQ_MARLIN_MAX_PARALLEL) + marlin_workspace = MarlinWorkspace( + size_n, GPTQ_MARLIN_MIN_THREAD_N, GPTQ_MARLIN_MAX_PARALLEL + ) - marlin_24_workspace = MarlinWorkspace(size_n, GPTQ_MARLIN_24_MIN_THREAD_N, - GPTQ_MARLIN_24_MAX_PARALLEL) + marlin_24_workspace = MarlinWorkspace( + size_n, GPTQ_MARLIN_24_MIN_THREAD_N, GPTQ_MARLIN_24_MAX_PARALLEL + ) marlin_zp = torch.zeros_like(marlin_s, dtype=torch.int) # AllSpark W8A16 quant - as_supported_case = (quant_type in ALLSPARK_SUPPORTED_QUANT_TYPES - and group_size == -1 and not act_order and is_k_full) + as_supported_case = ( + quant_type in ALLSPARK_SUPPORTED_QUANT_TYPES + and group_size == -1 + and not act_order + and is_k_full + ) if as_supported_case: properties = torch.cuda.get_device_properties(b.device.index) sm_count = properties.multi_processor_count sm_version = properties.major * 10 + properties.minor - supported_arch = (sm_version >= 80 and sm_version < 90) + supported_arch = sm_version >= 80 and sm_version < 90 as_supported_case = as_supported_case and supported_arch if supported_arch: has_zp = False - w_ref, qw, s, zp = quantize_weights(b, quant_type, group_size, - has_zp) + w_ref, qw, s, zp = quantize_weights(b, quant_type, group_size, has_zp) qw = qw.to(torch.uint8) - qw_reorder, s_reorder, zp_reorder = \ - ops.allspark_repack_weight( - qw, s, zp, has_zp) + qw_reorder, s_reorder, zp_reorder = ops.allspark_repack_weight( + qw, s, zp, has_zp + ) CUBLAS_M_THRESHOLD = ALLSPARK_AMPERE_M_CUBLAS_THRESHOLD globals = { @@ -136,8 +165,7 @@ def bench_run(results: list[benchmark.Measurement], model: str, "zp_reorder": zp_reorder if as_supported_case else None, "sm_count": sm_count if as_supported_case else None, "sm_version": sm_version if 
as_supported_case else None, - "CUBLAS_M_THRESHOLD": - CUBLAS_M_THRESHOLD if as_supported_case else None, + "CUBLAS_M_THRESHOLD": CUBLAS_M_THRESHOLD if as_supported_case else None, # Kernels "gptq_marlin_gemm": ops.gptq_marlin_gemm, "gptq_marlin_24_gemm": ops.gptq_marlin_24_gemm, @@ -158,60 +186,63 @@ def bench_run(results: list[benchmark.Measurement], model: str, label=label, sub_label=sub_label, description="pytorch_gemm", - ).blocked_autorange(min_run_time=min_run_time)) + ).blocked_autorange(min_run_time=min_run_time) + ) results.append( benchmark.Timer( - stmt= - "output = gptq_marlin_gemm(a, marlin_q_w, marlin_s, marlin_zp, marlin_g_idx, marlin_sort_indices, marlin_workspace.scratch, quant_type, size_m, size_n, size_k, is_k_full, False, False, False)", # noqa: E501 + stmt="output = gptq_marlin_gemm(a, marlin_q_w, marlin_s, marlin_zp, marlin_g_idx, marlin_sort_indices, marlin_workspace.scratch, quant_type, size_m, size_n, size_k, is_k_full, False, False, False)", # noqa: E501 globals=globals, label=label, sub_label=sub_label, description="gptq_marlin_gemm_fp16", - ).blocked_autorange(min_run_time=min_run_time)) + ).blocked_autorange(min_run_time=min_run_time) + ) results.append( benchmark.Timer( - stmt= - "output = gptq_marlin_gemm(a, marlin_q_w, marlin_s, marlin_zp, marlin_g_idx, marlin_sort_indices, marlin_workspace.scratch, quant_type, size_m, size_n, size_k, is_k_full, False, True, False)", # noqa: E501 + stmt="output = gptq_marlin_gemm(a, marlin_q_w, marlin_s, marlin_zp, marlin_g_idx, marlin_sort_indices, marlin_workspace.scratch, quant_type, size_m, size_n, size_k, is_k_full, False, True, False)", # noqa: E501 globals=globals, label=label, sub_label=sub_label, description="gptq_marlin_gemm_fp32", - ).blocked_autorange(min_run_time=min_run_time)) + ).blocked_autorange(min_run_time=min_run_time) + ) - if (quant_type in GPTQ_MARLIN_24_SUPPORTED_QUANT_TYPES - and group_size in GPTQ_MARLIN_24_SUPPORTED_GROUP_SIZES): + if ( + quant_type in GPTQ_MARLIN_24_SUPPORTED_QUANT_TYPES + and group_size in GPTQ_MARLIN_24_SUPPORTED_GROUP_SIZES + ): results.append( benchmark.Timer( - stmt= - "output = gptq_marlin_24_gemm(a, marlin_24_q_w_comp, marlin_24_meta, marlin_24_s, marlin_24_workspace.scratch, quant_type, size_m, size_n, size_k)", # noqa: E501 + stmt="output = gptq_marlin_24_gemm(a, marlin_24_q_w_comp, marlin_24_meta, marlin_24_s, marlin_24_workspace.scratch, quant_type, size_m, size_n, size_k)", # noqa: E501 globals=globals, label=label, sub_label=sub_label, description="gptq_marlin_24_gemm", - ).blocked_autorange(min_run_time=min_run_time)) + ).blocked_autorange(min_run_time=min_run_time) + ) results.append( benchmark.Timer( - stmt= - "q_res = gptq_marlin_repack(q_w_gptq, repack_sort_indices, size_k, size_n, quant_type.size_bits)", # noqa: E501 + stmt="q_res = gptq_marlin_repack(q_w_gptq, repack_sort_indices, size_k, size_n, quant_type.size_bits)", # noqa: E501 globals=globals, label=label, sub_label=sub_label, description="gptq_marlin_repack", - ).blocked_autorange(min_run_time=min_run_time)) + ).blocked_autorange(min_run_time=min_run_time) + ) if as_supported_case: results.append( benchmark.Timer( - stmt= - "output = allspark_w8a16_gemm(a, qw_reorder, s_reorder, zp_reorder, size_n, group_size, sm_count, sm_version, CUBLAS_M_THRESHOLD, False, True)", # noqa: E501 + stmt="output = allspark_w8a16_gemm(a, qw_reorder, s_reorder, zp_reorder, size_n, group_size, sm_count, sm_version, CUBLAS_M_THRESHOLD, False, True)", # noqa: E501 globals=globals, label=label, sub_label=sub_label, 
description="allspark_w8a16_gemm_fp32", - ).blocked_autorange(min_run_time=min_run_time)) + ).blocked_autorange(min_run_time=min_run_time) + ) def main(args): @@ -233,37 +264,50 @@ def main(args): continue for act_order in ACT_ORDER_OPTS: - if len(args.limit_act_order - ) > 0 and act_order not in args.limit_act_order: + if ( + len(args.limit_act_order) > 0 + and act_order not in args.limit_act_order + ): continue for is_k_full in K_FULL_OPTS: - if len(args.limit_k_full - ) > 0 and is_k_full not in args.limit_k_full: + if ( + len(args.limit_k_full) > 0 + and is_k_full not in args.limit_k_full + ): continue - for quant_type in query_marlin_supported_quant_types( - False): - if len(args.limit_num_bits) > 0 and \ - quant_type.size_bits not in args.limit_num_bits: + for quant_type in query_marlin_supported_quant_types(False): + if ( + len(args.limit_num_bits) > 0 + and quant_type.size_bits not in args.limit_num_bits + ): continue for group_size in MARLIN_SUPPORTED_GROUP_SIZES: - if len( - args.limit_group_size - ) > 0 and group_size not in args.limit_group_size: + if ( + len(args.limit_group_size) > 0 + and group_size not in args.limit_group_size + ): continue # For act_order, the group_size must be less than # size_k - if act_order and (group_size == size_k - or group_size == -1): + if act_order and (group_size == size_k or group_size == -1): continue for size_m in args.batch_sizes: - bench_run(results, model, act_order, is_k_full, - quant_type, group_size, size_m, - size_k, size_n) + bench_run( + results, + model, + act_order, + is_k_full, + quant_type, + group_size, + size_m, + size_k, + size_n, + ) compare = benchmark.Compare(results) compare.print() @@ -274,7 +318,8 @@ def main(args): # if __name__ == "__main__": parser = FlexibleArgumentParser( - description="Benchmark Marlin across specified models/shapes/batches") + description="Benchmark Marlin across specified models/shapes/batches" + ) parser.add_argument( "--models", nargs="+", @@ -282,10 +327,9 @@ def main(args): default=DEFAULT_MODELS, choices=WEIGHT_SHAPES.keys(), ) - parser.add_argument("--batch-sizes", - nargs="+", - type=int, - default=DEFAULT_BATCH_SIZES) + parser.add_argument( + "--batch-sizes", nargs="+", type=int, default=DEFAULT_BATCH_SIZES + ) parser.add_argument("--limit-k", nargs="+", type=int, default=[]) parser.add_argument("--limit-n", nargs="+", type=int, default=[]) parser.add_argument("--limit-group-size", nargs="+", type=int, default=[]) diff --git a/benchmarks/kernels/benchmark_moe.py b/benchmarks/kernels/benchmark_moe.py index 4e328b4d49e5..c2f7660858f5 100644 --- a/benchmarks/kernels/benchmark_moe.py +++ b/benchmarks/kernels/benchmark_moe.py @@ -31,56 +31,60 @@ class BenchmarkConfig(TypedDict): num_stages: int -def benchmark_config(config: BenchmarkConfig, - num_tokens: int, - num_experts: int, - shard_intermediate_size: int, - hidden_size: int, - topk: int, - dtype: torch.dtype, - use_fp8_w8a8: bool, - use_int8_w8a16: bool, - num_iters: int = 100, - block_quant_shape: List[int] = None, - use_deep_gemm: bool = False) -> float: +def benchmark_config( + config: BenchmarkConfig, + num_tokens: int, + num_experts: int, + shard_intermediate_size: int, + hidden_size: int, + topk: int, + dtype: torch.dtype, + use_fp8_w8a8: bool, + use_int8_w8a16: bool, + num_iters: int = 100, + block_quant_shape: List[int] = None, + use_deep_gemm: bool = False, +) -> float: init_dtype = torch.float16 if use_fp8_w8a8 else dtype x = torch.randn(num_tokens, hidden_size, dtype=dtype) if use_int8_w8a16: - w1 = torch.randint(-127, - 127, ( 
- num_experts, - shard_intermediate_size, - hidden_size, - ), - dtype=torch.int8) - w2 = torch.randint(-127, - 127, ( - num_experts, - hidden_size, - shard_intermediate_size // 2, - ), - dtype=torch.int8) + w1 = torch.randint( + -127, + 127, + ( + num_experts, + shard_intermediate_size, + hidden_size, + ), + dtype=torch.int8, + ) + w2 = torch.randint( + -127, + 127, + ( + num_experts, + hidden_size, + shard_intermediate_size // 2, + ), + dtype=torch.int8, + ) else: - w1 = torch.randn(num_experts, - shard_intermediate_size, - hidden_size, - dtype=init_dtype) - w2 = torch.randn(num_experts, - hidden_size, - shard_intermediate_size // 2, - dtype=init_dtype) - gating_output = torch.randn(num_iters, - num_tokens, - num_experts, - dtype=torch.float32) + w1 = torch.randn( + num_experts, shard_intermediate_size, hidden_size, dtype=init_dtype + ) + w2 = torch.randn( + num_experts, hidden_size, shard_intermediate_size // 2, dtype=init_dtype + ) + gating_output = torch.randn(num_iters, num_tokens, num_experts, dtype=torch.float32) w1_scale = None w2_scale = None a1_scale = None a2_scale = None if use_int8_w8a16: - w1_scale = torch.randn((num_experts, 2 * shard_intermediate_size), - dtype=torch.float32) + w1_scale = torch.randn( + (num_experts, 2 * shard_intermediate_size), dtype=torch.float32 + ) w2_scale = torch.randn((hidden_size, num_experts), dtype=torch.float32) if use_fp8_w8a8: if block_quant_shape: @@ -93,10 +97,14 @@ def benchmark_config(config: BenchmarkConfig, n_tiles_w2 = (K + block_n - 1) // block_n k_tiles_w1 = (K + block_k - 1) // block_k k_tiles_w2 = (N + block_k - 1) // block_k - w1_scale = torch.rand((E, n_tiles_w1, k_tiles_w1), - dtype=torch.float32) * factor_for_scale - w2_scale = torch.rand((E, n_tiles_w2, k_tiles_w2), - dtype=torch.float32) * factor_for_scale + w1_scale = ( + torch.rand((E, n_tiles_w1, k_tiles_w1), dtype=torch.float32) + * factor_for_scale + ) + w2_scale = ( + torch.rand((E, n_tiles_w2, k_tiles_w2), dtype=torch.float32) + * factor_for_scale + ) else: w1_scale = torch.randn(num_experts, dtype=torch.float32) w2_scale = torch.randn(num_experts, dtype=torch.float32) @@ -114,10 +122,12 @@ def prepare(i: int): def run(): from vllm.model_executor.layers.fused_moe import override_config + with override_config(config): if use_deep_gemm: topk_weights, topk_ids, token_expert_indices = fused_topk( - x, input_gating, topk, False) + x, input_gating, topk, False + ) return fused_experts( x, w1, @@ -213,8 +223,7 @@ def get_rocm_tuning_space(use_fp16): return param_ranges -def get_configs_compute_bound(use_fp16, - block_quant_shape) -> list[dict[str, int]]: +def get_configs_compute_bound(use_fp16, block_quant_shape) -> list[dict[str, int]]: configs: list[BenchmarkConfig] = [] if current_platform.is_rocm(): @@ -250,20 +259,25 @@ def get_configs_compute_bound(use_fp16, if block_quant_shape is not None and not use_fp16: block_n, block_k = block_quant_shape[0], block_quant_shape[1] for config in configs[:]: - if config["BLOCK_SIZE_K"] % block_k != 0 or config[ - "BLOCK_SIZE_N"] % block_n != 0: + if ( + config["BLOCK_SIZE_K"] % block_k != 0 + or config["BLOCK_SIZE_N"] % block_n != 0 + ): configs.remove(config) return configs -def prune_rocm_search_space(num_tokens, shard_intermediate_size, hidden_size, - search_space, is_fp16, topk): +def prune_rocm_search_space( + num_tokens, shard_intermediate_size, hidden_size, search_space, is_fp16, topk +): N1, K1 = shard_intermediate_size, hidden_size N2, K2 = hidden_size, shard_intermediate_size // 2 - pruned_space_1 = 
prune_rocm_configs(num_tokens * topk, N1, K1, - search_space, is_fp16) - pruned_space_2 = prune_rocm_configs(num_tokens * topk, N2, K2, - search_space, is_fp16) + pruned_space_1 = prune_rocm_configs( + num_tokens * topk, N1, K1, search_space, is_fp16 + ) + pruned_space_2 = prune_rocm_configs( + num_tokens * topk, N2, K2, search_space, is_fp16 + ) search_space = merge_unique_dicts(pruned_space_1, pruned_space_2) return search_space @@ -301,14 +315,14 @@ def prune_rocm_configs(M, N, K, configs, is_fp16=True): SPLIT_K = config.get("SPLIT_K", 1) GROUP_M = config.get("GROUP_SIZE_M") if is_fp16: - if (matrix_instr_nonkdim > BLOCK_SIZE_M - or matrix_instr_nonkdim > BLOCK_SIZE_N): + if ( + matrix_instr_nonkdim > BLOCK_SIZE_M + or matrix_instr_nonkdim > BLOCK_SIZE_N + ): continue - if (matrix_instr_nonkdim >= M - and matrix_instr_nonkdim != BLOCK_SIZE_M): + if matrix_instr_nonkdim >= M and matrix_instr_nonkdim != BLOCK_SIZE_M: continue - if (matrix_instr_nonkdim >= N - and matrix_instr_nonkdim != BLOCK_SIZE_N): + if matrix_instr_nonkdim >= N and matrix_instr_nonkdim != BLOCK_SIZE_N: continue # Skip BLOCK_SIZE that is too large compare to M/N # unless BLOCK_SIZE is already small enough @@ -329,8 +343,10 @@ def prune_rocm_configs(M, N, K, configs, is_fp16=True): continue # out of shared memory resource # TODO (zhanglx): This does not consider the LDS usage in the epilogue - LDS = (BLOCK_SIZE_K * BLOCK_SIZE_M * elemBytes_a + - BLOCK_SIZE_K * BLOCK_SIZE_N * elemBytes_b) + LDS = ( + BLOCK_SIZE_K * BLOCK_SIZE_M * elemBytes_a + + BLOCK_SIZE_K * BLOCK_SIZE_N * elemBytes_b + ) if LDS > 65536: continue # Skip small block sizes and num_warps for large gemm @@ -364,7 +380,6 @@ def merge_unique_dicts(list1, list2): @ray.remote(num_gpus=1) class BenchmarkWorker: - def __init__(self, seed: int) -> None: torch.set_default_device("cuda") current_platform.seed_everything(seed) @@ -388,36 +403,40 @@ def benchmark( use_deep_gemm: bool = False, ) -> tuple[dict[str, int], float]: current_platform.seed_everything(self.seed) - dtype_str = get_config_dtype_str(dtype, - use_int8_w8a16=use_int8_w8a16, - use_fp8_w8a8=use_fp8_w8a8) + dtype_str = get_config_dtype_str( + dtype, use_int8_w8a16=use_int8_w8a16, use_fp8_w8a8=use_fp8_w8a8 + ) # NOTE(woosuk): The current naming convention uses w2.shape[2], which # is the intermediate size after silu_and_mul. 
- op_config = get_moe_configs(num_experts, shard_intermediate_size // 2, - dtype_str) + op_config = get_moe_configs( + num_experts, shard_intermediate_size // 2, dtype_str + ) if op_config is None: - config = get_default_config(num_tokens, - num_experts, - shard_intermediate_size, - hidden_size, - topk, - dtype_str, - is_marlin=False) + config = get_default_config( + num_tokens, + num_experts, + shard_intermediate_size, + hidden_size, + topk, + dtype_str, + is_marlin=False, + ) else: - config = op_config[min(op_config.keys(), - key=lambda x: abs(x - num_tokens))] - kernel_time = benchmark_config(config, - num_tokens, - num_experts, - shard_intermediate_size, - hidden_size, - topk, - dtype, - use_fp8_w8a8, - use_int8_w8a16, - num_iters=100, - block_quant_shape=block_quant_shape, - use_deep_gemm=use_deep_gemm) + config = op_config[min(op_config.keys(), key=lambda x: abs(x - num_tokens))] + kernel_time = benchmark_config( + config, + num_tokens, + num_experts, + shard_intermediate_size, + hidden_size, + topk, + dtype, + use_fp8_w8a8, + use_int8_w8a16, + num_iters=100, + block_quant_shape=block_quant_shape, + use_deep_gemm=use_deep_gemm, + ) return config, kernel_time def tune( @@ -438,10 +457,14 @@ def tune( best_time = float("inf") if current_platform.is_rocm(): is_fp16 = not (use_fp8_w8a8 or use_int8_w8a16) - search_space = prune_rocm_search_space(num_tokens, - shard_intermediate_size, - hidden_size, search_space, - is_fp16, topk) + search_space = prune_rocm_search_space( + num_tokens, + shard_intermediate_size, + hidden_size, + search_space, + is_fp16, + topk, + ) need_device_guard = False if current_platform.is_rocm(): @@ -449,8 +472,7 @@ def tune( if visible_device != f"{self.device_id}": need_device_guard = True - with torch.cuda.device( - self.device_id) if need_device_guard else nullcontext(): + with torch.cuda.device(self.device_id) if need_device_guard else nullcontext(): for config in tqdm(search_space): try: kernel_time = benchmark_config( @@ -465,7 +487,8 @@ def tune( use_int8_w8a16, num_iters=20, block_quant_shape=block_quant_shape, - use_deep_gemm=use_deep_gemm) + use_deep_gemm=use_deep_gemm, + ) except triton.runtime.autotuner.OutOfResources: # Some configurations may be invalid and fail to compile. 
continue @@ -481,42 +504,44 @@ def tune( def sort_config(config: BenchmarkConfig) -> BenchmarkConfig: return { - "BLOCK_SIZE_M": - config["BLOCK_SIZE_M"], - "BLOCK_SIZE_N": - config["BLOCK_SIZE_N"], - "BLOCK_SIZE_K": - config["BLOCK_SIZE_K"], - "GROUP_SIZE_M": - config["GROUP_SIZE_M"], - "num_warps": - config["num_warps"], - "num_stages": - config["num_stages"], - **({ - "waves_per_eu": config["waves_per_eu"] - } if "waves_per_eu" in config else {}), - **({ - "matrix_instr_nonkdim": config["matrix_instr_nonkdim"] - } if "matrix_instr_nonkdim" in config else {}), - **({ - "kpack": config["kpack"] - } if "kpack" in config else {}), + "BLOCK_SIZE_M": config["BLOCK_SIZE_M"], + "BLOCK_SIZE_N": config["BLOCK_SIZE_N"], + "BLOCK_SIZE_K": config["BLOCK_SIZE_K"], + "GROUP_SIZE_M": config["GROUP_SIZE_M"], + "num_warps": config["num_warps"], + "num_stages": config["num_stages"], + **( + {"waves_per_eu": config["waves_per_eu"]} if "waves_per_eu" in config else {} + ), + **( + {"matrix_instr_nonkdim": config["matrix_instr_nonkdim"]} + if "matrix_instr_nonkdim" in config + else {} + ), + **({"kpack": config["kpack"]} if "kpack" in config else {}), } -def save_configs(configs: dict[int, BenchmarkConfig], num_experts: int, - shard_intermediate_size: int, hidden_size: int, topk: int, - dtype: torch.dtype, use_fp8_w8a8: bool, use_int8_w8a16: bool, - block_quant_shape: List[int]) -> None: - dtype_str = get_config_dtype_str(dtype, - use_int8_w8a16=use_int8_w8a16, - use_fp8_w8a8=use_fp8_w8a8) +def save_configs( + configs: dict[int, BenchmarkConfig], + num_experts: int, + shard_intermediate_size: int, + hidden_size: int, + topk: int, + dtype: torch.dtype, + use_fp8_w8a8: bool, + use_int8_w8a16: bool, + block_quant_shape: List[int], +) -> None: + dtype_str = get_config_dtype_str( + dtype, use_int8_w8a16=use_int8_w8a16, use_fp8_w8a8=use_fp8_w8a8 + ) # NOTE(woosuk): The current naming convention uses w2.shape[2], which # is the intermediate size after silu_and_mul. 
- filename = get_config_file_name(num_experts, shard_intermediate_size // 2, - dtype_str, block_quant_shape) + filename = get_config_file_name( + num_experts, shard_intermediate_size // 2, dtype_str, block_quant_shape + ) print(f"Writing best config to {filename}...") with open(filename, "w") as f: @@ -525,18 +550,16 @@ def save_configs(configs: dict[int, BenchmarkConfig], num_experts: int, def get_weight_block_size_safety(config, default_value=None): - - quantization_config = getattr(config, 'quantization_config', {}) + quantization_config = getattr(config, "quantization_config", {}) if isinstance(quantization_config, dict): - return quantization_config.get('weight_block_size', default_value) + return quantization_config.get("weight_block_size", default_value) return default_value def main(args: argparse.Namespace): print(args) - config = get_config(model=args.model, - trust_remote_code=args.trust_remote_code) + config = get_config(model=args.model, trust_remote_code=args.trust_remote_code) if args.model_prefix: config = getattr(config, args.model_prefix) config = SimpleNamespace(**config) @@ -551,14 +574,12 @@ def main(args: argparse.Namespace): topk = config.num_experts_per_tok intermediate_size = config.intermediate_size shard_intermediate_size = 2 * intermediate_size // args.tp_size - elif (config.architectures[0] - in ("DeepseekV3ForCausalLM", "DeepseekV2ForCausalLM")): + elif config.architectures[0] in ("DeepseekV3ForCausalLM", "DeepseekV2ForCausalLM"): E = config.n_routed_experts topk = config.num_experts_per_tok intermediate_size = config.moe_intermediate_size shard_intermediate_size = 2 * intermediate_size // args.tp_size - elif config.architectures[0] in ("Qwen2MoeForCausalLM", - "Qwen3MoeForCausalLM"): + elif config.architectures[0] in ("Qwen2MoeForCausalLM", "Qwen3MoeForCausalLM"): E = config.num_experts topk = config.num_experts_per_tok intermediate_size = config.moe_intermediate_size @@ -573,16 +594,35 @@ def main(args: argparse.Namespace): shard_intermediate_size = 2 * intermediate_size // args.tp_size hidden_size = config.hidden_size - dtype = torch.float16 if current_platform.is_rocm() else getattr( - torch, config.torch_dtype) + dtype = ( + torch.float16 + if current_platform.is_rocm() + else getattr(torch, config.torch_dtype) + ) use_fp8_w8a8 = args.dtype == "fp8_w8a8" use_int8_w8a16 = args.dtype == "int8_w8a16" block_quant_shape = get_weight_block_size_safety(config) if args.batch_size is None: batch_sizes = [ - 1, 2, 4, 8, 16, 24, 32, 48, 64, 96, 128, 256, 512, 1024, 1536, - 2048, 3072, 4096 + 1, + 2, + 4, + 8, + 16, + 24, + 32, + 48, + 64, + 96, + 128, + 256, + 512, + 1024, + 1536, + 2048, + 3072, + 4096, ] else: batch_sizes = [args.batch_size] @@ -593,7 +633,8 @@ def main(args: argparse.Namespace): # Ray will set ROCR_VISIBLE_DEVICES for device visibility logger.warning( "Ray uses ROCR_VISIBLE_DEVICES to control device accessibility." - "Replacing HIP_VISIBLE_DEVICES with ROCR_VISIBLE_DEVICES.") + "Replacing HIP_VISIBLE_DEVICES with ROCR_VISIBLE_DEVICES." 
+ ) val = os.environ["HIP_VISIBLE_DEVICES"] os.environ["ROCR_VISIBLE_DEVICES"] = val del os.environ["HIP_VISIBLE_DEVICES"] @@ -620,25 +661,59 @@ def _distribute(method: str, inputs: list[Any]) -> list[Any]: start = time.time() configs = _distribute( - "tune", [(batch_size, E, shard_intermediate_size, hidden_size, - topk, dtype, use_fp8_w8a8, use_int8_w8a16, search_space, - block_quant_shape, use_deep_gemm) - for batch_size in batch_sizes]) + "tune", + [ + ( + batch_size, + E, + shard_intermediate_size, + hidden_size, + topk, + dtype, + use_fp8_w8a8, + use_int8_w8a16, + search_space, + block_quant_shape, + use_deep_gemm, + ) + for batch_size in batch_sizes + ], + ) best_configs = { - M: sort_config(config) - for M, config in zip(batch_sizes, configs) + M: sort_config(config) for M, config in zip(batch_sizes, configs) } - save_configs(best_configs, E, shard_intermediate_size, hidden_size, - topk, dtype, use_fp8_w8a8, use_int8_w8a16, - block_quant_shape) + save_configs( + best_configs, + E, + shard_intermediate_size, + hidden_size, + topk, + dtype, + use_fp8_w8a8, + use_int8_w8a16, + block_quant_shape, + ) end = time.time() print(f"Tuning took {end - start:.2f} seconds") else: outputs = _distribute( "benchmark", - [(batch_size, E, shard_intermediate_size, hidden_size, topk, dtype, - use_fp8_w8a8, use_int8_w8a16, block_quant_shape, use_deep_gemm) - for batch_size in batch_sizes]) + [ + ( + batch_size, + E, + shard_intermediate_size, + hidden_size, + topk, + dtype, + use_fp8_w8a8, + use_int8_w8a16, + block_quant_shape, + use_deep_gemm, + ) + for batch_size in batch_sizes + ], + ) for batch_size, (config, kernel_time) in zip(batch_sizes, outputs): print(f"Batch size: {batch_size}, config: {config}") @@ -647,18 +722,15 @@ def _distribute(method: str, inputs: list[Any]) -> list[Any]: if __name__ == "__main__": parser = FlexibleArgumentParser() - parser.add_argument("--model", - type=str, - default="mistralai/Mixtral-8x7B-Instruct-v0.1") - parser.add_argument("--tp-size", - "-tp", - "--tensor-parallel-size", - type=int, - default=2) - parser.add_argument("--dtype", - type=str, - choices=["auto", "fp8_w8a8", "int8_w8a16"], - default="auto") + parser.add_argument( + "--model", type=str, default="mistralai/Mixtral-8x7B-Instruct-v0.1" + ) + parser.add_argument( + "--tp-size", "-tp", "--tensor-parallel-size", type=int, default=2 + ) + parser.add_argument( + "--dtype", type=str, choices=["auto", "fp8_w8a8", "int8_w8a16"], default="auto" + ) parser.add_argument("--use-deep-gemm", action="store_true") parser.add_argument("--seed", type=int, default=0) parser.add_argument("--batch-size", type=int, required=False) diff --git a/benchmarks/kernels/benchmark_moe_permute_unpermute.py b/benchmarks/kernels/benchmark_moe_permute_unpermute.py index 937df9624651..333986fdf5ef 100644 --- a/benchmarks/kernels/benchmark_moe_permute_unpermute.py +++ b/benchmarks/kernels/benchmark_moe_permute_unpermute.py @@ -8,7 +8,9 @@ from transformers import AutoConfig from vllm.model_executor.layers.fused_moe.deep_gemm_moe import ( - _moe_permute, _moe_unpermute_and_reduce) + _moe_permute, + _moe_unpermute_and_reduce, +) from vllm.model_executor.layers.fused_moe.fused_moe import * from vllm.model_executor.layers.fused_moe.moe_permute_unpermute import * from vllm.model_executor.layers.fused_moe.utils import _fp8_quantize @@ -27,15 +29,17 @@ class BenchmarkConfig(TypedDict): num_stages: int -def benchmark_permute(num_tokens: int, - num_experts: int, - hidden_size: int, - topk: int, - dtype: torch.dtype, - use_fp8_w8a8: bool, - 
use_int8_w8a16: bool, - num_iters: int = 100, - use_customized_permute: bool = False) -> float: +def benchmark_permute( + num_tokens: int, + num_experts: int, + hidden_size: int, + topk: int, + dtype: torch.dtype, + use_fp8_w8a8: bool, + use_int8_w8a16: bool, + num_iters: int = 100, + use_customized_permute: bool = False, +) -> float: # init_dtype = torch.float16 if use_fp8_w8a8 else dtype hidden_states = torch.randn(num_tokens, hidden_size, dtype=dtype) # output_hidden_states = torch.empty_like(hidden_states) @@ -46,36 +50,41 @@ def benchmark_permute(num_tokens: int, align_block_size = None qhidden_states = hidden_states - gating_output = torch.randn(num_iters, - num_tokens, - num_experts, - dtype=torch.float32) + gating_output = torch.randn(num_iters, num_tokens, num_experts, dtype=torch.float32) input_gating = torch.randn(num_tokens, num_experts, dtype=torch.float32) topk_weights, topk_ids, token_expert_indices = fused_topk( - qhidden_states, input_gating, topk, False) + qhidden_states, input_gating, topk, False + ) def prepare(i: int): input_gating.copy_(gating_output[i]) def run(): if use_customized_permute: - (permuted_hidden_states, first_token_off, inv_perm_idx, - m_indices) = moe_permute( - qhidden_states, - topk_weights=topk_weights, - topk_ids=topk_ids, - token_expert_indices=token_expert_indices, - topk=topk, - n_expert=num_experts, - n_local_expert=num_experts, - expert_map=None, - align_block_size=align_block_size, - ) + (permuted_hidden_states, first_token_off, inv_perm_idx, m_indices) = ( + moe_permute( + qhidden_states, + topk_weights=topk_weights, + topk_ids=topk_ids, + token_expert_indices=token_expert_indices, + topk=topk, + n_expert=num_experts, + n_local_expert=num_experts, + expert_map=None, + align_block_size=align_block_size, + ) + ) else: - (permuted_hidden_states, a1q_scale, sorted_token_ids, expert_ids, - inv_perm) = _moe_permute(qhidden_states, None, topk_ids, - num_experts, None, align_block_size) + ( + permuted_hidden_states, + a1q_scale, + sorted_token_ids, + expert_ids, + inv_perm, + ) = _moe_permute( + qhidden_states, None, topk_ids, num_experts, None, align_block_size + ) # JIT compilation & warmup run() @@ -111,15 +120,17 @@ def run(): return avg -def benchmark_unpermute(num_tokens: int, - num_experts: int, - hidden_size: int, - topk: int, - dtype: torch.dtype, - use_fp8_w8a8: bool, - use_int8_w8a16: bool, - num_iters: int = 100, - use_customized_permute: bool = False) -> float: +def benchmark_unpermute( + num_tokens: int, + num_experts: int, + hidden_size: int, + topk: int, + dtype: torch.dtype, + use_fp8_w8a8: bool, + use_int8_w8a16: bool, + num_iters: int = 100, + use_customized_permute: bool = False, +) -> float: # init_dtype = torch.float16 if use_fp8_w8a8 else dtype hidden_states = torch.randn(num_tokens, hidden_size, dtype=dtype) output_hidden_states = torch.empty_like(hidden_states) @@ -133,46 +144,74 @@ def benchmark_unpermute(num_tokens: int, input_gating = torch.randn(num_tokens, num_experts, dtype=torch.float32) topk_weights, topk_ids, token_expert_indices = fused_topk( - qhidden_states, input_gating, topk, False) + qhidden_states, input_gating, topk, False + ) def prepare(): if use_customized_permute: - (permuted_hidden_states, first_token_off, inv_perm_idx, - m_indices) = moe_permute( - qhidden_states, - topk_weights=topk_weights, - topk_ids=topk_ids, - token_expert_indices=token_expert_indices, - topk=topk, - n_expert=num_experts, - n_local_expert=num_experts, - expert_map=None, - align_block_size=align_block_size, - ) + 
(permuted_hidden_states, first_token_off, inv_perm_idx, m_indices) = ( + moe_permute( + qhidden_states, + topk_weights=topk_weights, + topk_ids=topk_ids, + token_expert_indices=token_expert_indices, + topk=topk, + n_expert=num_experts, + n_local_expert=num_experts, + expert_map=None, + align_block_size=align_block_size, + ) + ) # convert to fp16/bf16 as gemm output - return (permuted_hidden_states.to(dtype), first_token_off, - inv_perm_idx, m_indices) + return ( + permuted_hidden_states.to(dtype), + first_token_off, + inv_perm_idx, + m_indices, + ) else: - (permuted_qhidden_states, a1q_scale, sorted_token_ids, expert_ids, - inv_perm) = _moe_permute(qhidden_states, None, topk_ids, - num_experts, None, align_block_size) + ( + permuted_qhidden_states, + a1q_scale, + sorted_token_ids, + expert_ids, + inv_perm, + ) = _moe_permute( + qhidden_states, None, topk_ids, num_experts, None, align_block_size + ) # convert to fp16/bf16 as gemm output - return (permuted_qhidden_states.to(dtype), a1q_scale, - sorted_token_ids, expert_ids, inv_perm) + return ( + permuted_qhidden_states.to(dtype), + a1q_scale, + sorted_token_ids, + expert_ids, + inv_perm, + ) def run(input: tuple): if use_customized_permute: - (permuted_hidden_states, first_token_off, inv_perm_idx, - m_indices) = input - moe_unpermute(permuted_hidden_states, topk_weights, topk_ids, - inv_perm_idx, first_token_off, topk, num_experts, - num_experts) + (permuted_hidden_states, first_token_off, inv_perm_idx, m_indices) = input + moe_unpermute( + permuted_hidden_states, + topk_weights, + topk_ids, + inv_perm_idx, + first_token_off, + topk, + num_experts, + num_experts, + ) else: - (permuted_hidden_states, a1q_scale, sorted_token_ids, expert_ids, - inv_perm) = input - _moe_unpermute_and_reduce(output_hidden_states, - permuted_hidden_states, inv_perm, - topk_weights) + ( + permuted_hidden_states, + a1q_scale, + sorted_token_ids, + expert_ids, + inv_perm, + ) = input + _moe_unpermute_and_reduce( + output_hidden_states, permuted_hidden_states, inv_perm, topk_weights + ) # JIT compilation & warmup input = prepare() @@ -209,7 +248,6 @@ def run(input: tuple): @ray.remote(num_gpus=1) class BenchmarkWorker: - def __init__(self, seed: int) -> None: torch.set_default_device("cuda") current_platform.seed_everything(seed) @@ -241,7 +279,8 @@ def benchmark( use_fp8_w8a8, use_int8_w8a16, num_iters=100, - use_customized_permute=use_customized_permute) + use_customized_permute=use_customized_permute, + ) unpermute_time = benchmark_unpermute( num_tokens, num_experts, @@ -251,15 +290,15 @@ def benchmark( use_fp8_w8a8, use_int8_w8a16, num_iters=100, - use_customized_permute=use_customized_permute) + use_customized_permute=use_customized_permute, + ) return permute_time, unpermute_time def get_weight_block_size_safety(config, default_value=None): - - quantization_config = getattr(config, 'quantization_config', {}) + quantization_config = getattr(config, "quantization_config", {}) if isinstance(quantization_config, dict): - return quantization_config.get('weight_block_size', default_value) + return quantization_config.get("weight_block_size", default_value) return default_value @@ -267,20 +306,21 @@ def main(args: argparse.Namespace): print(args) config = AutoConfig.from_pretrained( - args.model, trust_remote_code=args.trust_remote_code) + args.model, trust_remote_code=args.trust_remote_code + ) if config.architectures[0] == "DbrxForCausalLM": E = config.ffn_config.moe_num_experts topk = config.ffn_config.moe_top_k elif config.architectures[0] == "JambaForCausalLM": E 
= config.num_experts topk = config.num_experts_per_tok - elif (config.architectures[0] == "DeepseekV3ForCausalLM" - or config.architectures[0] == "DeepseekV2ForCausalLM"): + elif ( + config.architectures[0] == "DeepseekV3ForCausalLM" + or config.architectures[0] == "DeepseekV2ForCausalLM" + ): E = config.n_routed_experts topk = config.num_experts_per_tok - elif config.architectures[0] in [ - "Qwen2MoeForCausalLM", "Qwen3MoeForCausalLM" - ]: + elif config.architectures[0] in ["Qwen2MoeForCausalLM", "Qwen3MoeForCausalLM"]: E = config.num_experts topk = config.num_experts_per_tok @@ -299,8 +339,24 @@ def main(args: argparse.Namespace): if args.batch_size is None: batch_sizes = [ - 1, 2, 4, 8, 16, 24, 32, 48, 64, 96, 128, 256, 512, 1024, 1536, - 2048, 3072, 4096 + 1, + 2, + 4, + 8, + 16, + 24, + 32, + 48, + 64, + 96, + 128, + 256, + 512, + 1024, + 1536, + 2048, + 3072, + 4096, ] else: batch_sizes = [args.batch_size] @@ -321,9 +377,21 @@ def _distribute(method: str, inputs: list[Any]) -> list[Any]: return ray.get(outputs) outputs = _distribute( - "benchmark", [(batch_size, E, hidden_size, topk, dtype, use_fp8_w8a8, - use_int8_w8a16, use_customized_permute) - for batch_size in batch_sizes]) + "benchmark", + [ + ( + batch_size, + E, + hidden_size, + topk, + dtype, + use_fp8_w8a8, + use_int8_w8a16, + use_customized_permute, + ) + for batch_size in batch_sizes + ], + ) for batch_size, (permute, unpermute) in zip(batch_sizes, outputs): print(f"Batch size: {batch_size}") @@ -333,13 +401,12 @@ def _distribute(method: str, inputs: list[Any]) -> list[Any]: if __name__ == "__main__": parser = FlexibleArgumentParser() - parser.add_argument("--model", - type=str, - default="mistralai/Mixtral-8x7B-Instruct-v0.1") - parser.add_argument("--dtype", - type=str, - choices=["auto", "fp8_w8a8", "int8_w8a16"], - default="auto") + parser.add_argument( + "--model", type=str, default="mistralai/Mixtral-8x7B-Instruct-v0.1" + ) + parser.add_argument( + "--dtype", type=str, choices=["auto", "fp8_w8a8", "int8_w8a16"], default="auto" + ) parser.add_argument("--use-customized-permute", action="store_true") parser.add_argument("--seed", type=int, default=0) parser.add_argument("--batch-size", type=int, required=False) diff --git a/benchmarks/kernels/benchmark_paged_attention.py b/benchmarks/kernels/benchmark_paged_attention.py index 2625239b08ef..17432159c94e 100644 --- a/benchmarks/kernels/benchmark_paged_attention.py +++ b/benchmarks/kernels/benchmark_paged_attention.py @@ -9,8 +9,11 @@ from vllm import _custom_ops as ops from vllm.logger import init_logger from vllm.platforms import current_platform -from vllm.utils import (STR_DTYPE_TO_TORCH_DTYPE, FlexibleArgumentParser, - create_kv_caches_with_random) +from vllm.utils import ( + STR_DTYPE_TO_TORCH_DTYPE, + FlexibleArgumentParser, + create_kv_caches_with_random, +) logger = init_logger(__name__) @@ -38,19 +41,15 @@ def main( current_platform.seed_everything(seed) scale = float(1.0 / (head_size**0.5)) - query = torch.empty(num_seqs, - num_query_heads, - head_size, - dtype=dtype, - device=device) + query = torch.empty( + num_seqs, num_query_heads, head_size, dtype=dtype, device=device + ) query.uniform_(-scale, scale) assert num_query_heads % num_kv_heads == 0 alibi_slopes = None if use_alibi: - alibi_slopes = torch.randn(num_query_heads, - dtype=torch.float, - device=device) + alibi_slopes = torch.randn(num_query_heads, dtype=torch.float, device=device) seq_lens = [seq_len for _ in range(num_seqs)] max_seq_len = max(seq_lens) @@ -61,24 +60,23 @@ def main( 
block_tables_lst: list[list[int]] = [] for _ in range(num_seqs): block_table = [ - random.randint(0, NUM_BLOCKS - 1) - for _ in range(max_num_blocks_per_seq) + random.randint(0, NUM_BLOCKS - 1) for _ in range(max_num_blocks_per_seq) ] block_tables_lst.append(block_table) - block_tables = torch.tensor(block_tables_lst, - dtype=torch.int, - device=device) + block_tables = torch.tensor(block_tables_lst, dtype=torch.int, device=device) # Create the KV cache. - key_caches, value_caches = create_kv_caches_with_random(NUM_BLOCKS, - block_size, - 1, - num_kv_heads, - head_size, - kv_cache_dtype, - dtype, - device=device) + key_caches, value_caches = create_kv_caches_with_random( + NUM_BLOCKS, + block_size, + 1, + num_kv_heads, + head_size, + kv_cache_dtype, + dtype, + device=device, + ) key_cache, value_cache = key_caches[0], value_caches[0] # Prepare for the paged attention kernel. @@ -86,11 +84,8 @@ def main( if version == "v2": if current_platform.is_rocm(): global PARTITION_SIZE - if not args.custom_paged_attn: - PARTITION_SIZE = 1024 - else: - PARTITION_SIZE = PARTITION_SIZE_ROCM - num_partitions = ((max_seq_len + PARTITION_SIZE - 1) // PARTITION_SIZE) + PARTITION_SIZE = 1024 if not args.custom_paged_attn else PARTITION_SIZE_ROCM + num_partitions = (max_seq_len + PARTITION_SIZE - 1) // PARTITION_SIZE tmp_output = torch.empty( size=(num_seqs, num_query_heads, num_partitions, head_size), dtype=output.dtype, @@ -110,9 +105,7 @@ def run_cuda_benchmark(num_iters: int, profile: bool = False) -> float: start_time = time.perf_counter() # Using default kv_scale - k_scale = v_scale = torch.tensor(1.0, - dtype=torch.float32, - device=device) + k_scale = v_scale = torch.tensor(1.0, dtype=torch.float32, device=device) for _ in range(num_iters): if version == "v1": @@ -195,30 +188,29 @@ def run_cuda_benchmark(num_iters: int, profile: bool = False) -> float: print(f"Kernel running time: {latency * 1000000:.3f} us") -if __name__ == '__main__': - logger.warning("This script benchmarks the paged attention kernel. " - "By default this is no longer used in vLLM inference.") +if __name__ == "__main__": + logger.warning( + "This script benchmarks the paged attention kernel. " + "By default this is no longer used in vLLM inference." 
+ ) - parser = FlexibleArgumentParser( - description="Benchmark the paged attention kernel.") - parser.add_argument("--version", - type=str, - choices=["v1", "v2"], - default="v2") + parser = FlexibleArgumentParser(description="Benchmark the paged attention kernel.") + parser.add_argument("--version", type=str, choices=["v1", "v2"], default="v2") parser.add_argument("--batch-size", type=int, default=8) parser.add_argument("--seq-len", type=int, default=4096) parser.add_argument("--num-query-heads", type=int, default=64) parser.add_argument("--num-kv-heads", type=int, default=8) - parser.add_argument("--head-size", - type=int, - choices=[64, 80, 96, 112, 120, 128, 192, 256], - default=128) + parser.add_argument( + "--head-size", + type=int, + choices=[64, 80, 96, 112, 120, 128, 192, 256], + default=128, + ) parser.add_argument("--block-size", type=int, choices=[16, 32], default=16) parser.add_argument("--use-alibi", action="store_true") - parser.add_argument("--dtype", - type=str, - choices=["half", "bfloat16", "float"], - default="half") + parser.add_argument( + "--dtype", type=str, choices=["half", "bfloat16", "float"], default="half" + ) parser.add_argument("--seed", type=int, default=0) parser.add_argument("--profile", action="store_true") parser.add_argument( @@ -228,10 +220,11 @@ def run_cuda_benchmark(num_iters: int, profile: bool = False) -> float: default="auto", help="Data type for kv cache storage. If 'auto', will use model " "data type. CUDA 11.8+ supports fp8 (=fp8_e4m3) and fp8_e5m2. " - "ROCm (AMD GPU) supports fp8 (=fp8_e4m3)") - parser.add_argument("--custom-paged-attn", - action="store_true", - help="Use custom paged attention") + "ROCm (AMD GPU) supports fp8 (=fp8_e4m3)", + ) + parser.add_argument( + "--custom-paged-attn", action="store_true", help="Use custom paged attention" + ) args = parser.parse_args() print(args) diff --git a/benchmarks/kernels/benchmark_quant.py b/benchmarks/kernels/benchmark_quant.py index b643897a60ee..2463dfebe83c 100644 --- a/benchmarks/kernels/benchmark_quant.py +++ b/benchmarks/kernels/benchmark_quant.py @@ -10,15 +10,17 @@ @torch.inference_mode() -def main(num_tokens: int, - hidden_size: int, - static_scale: bool, - quant_dtype: torch.dtype, - dtype: torch.dtype, - seed: int = 0, - do_profile: bool = False, - num_warmup_iters: int = 5, - num_iters: int = 100) -> None: +def main( + num_tokens: int, + hidden_size: int, + static_scale: bool, + quant_dtype: torch.dtype, + dtype: torch.dtype, + seed: int = 0, + do_profile: bool = False, + num_warmup_iters: int = 5, + num_iters: int = 100, +) -> None: current_platform.seed_everything(seed) torch.set_default_device("cuda") @@ -56,7 +58,7 @@ def run_cuda_benchmark(num_iters: int, profile: bool = False) -> float: print(f"Kernel running time: {latency * 1000000:.3f} us") -if __name__ == '__main__': +if __name__ == "__main__": def to_torch_dtype(dt): if dt == "int8": @@ -66,37 +68,40 @@ def to_torch_dtype(dt): raise ValueError(f"Unsupported dtype: {dt}") parser = FlexibleArgumentParser( - description="Benchmark the quantization (fp8 or int8) kernel.") + description="Benchmark the quantization (fp8 or int8) kernel." 
+ ) parser.add_argument("--num-tokens", type=int, default=4096) parser.add_argument("--hidden-size", type=int, default=8192) parser.add_argument("--static-scale", action="store_true") - parser.add_argument("--quant-dtype", - type=str, - choices=["fp8", "int8"], - default="int8") - parser.add_argument("--dtype", - type=str, - choices=["half", "bfloat16", "float"], - default="half") + parser.add_argument( + "--quant-dtype", type=str, choices=["fp8", "int8"], default="int8" + ) + parser.add_argument( + "--dtype", type=str, choices=["half", "bfloat16", "float"], default="half" + ) parser.add_argument("--seed", type=int, default=0) parser.add_argument("--profile", action="store_true") parser.add_argument("--num-warmup-iters", type=int, default=5) - parser.add_argument("--num-iters", - type=int, - default=100, - help="Number of benchmark iterations. " - "If --profile is set, this number is ignored") + parser.add_argument( + "--num-iters", + type=int, + default=100, + help="Number of benchmark iterations. " + "If --profile is set, this number is ignored", + ) args = parser.parse_args() print(args) - main(num_tokens=args.num_tokens, - hidden_size=args.hidden_size, - static_scale=args.static_scale, - quant_dtype=to_torch_dtype(args.quant_dtype), - dtype=STR_DTYPE_TO_TORCH_DTYPE[args.dtype], - seed=args.seed, - do_profile=args.profile, - num_warmup_iters=args.num_warmup_iters, - num_iters=args.num_iters) + main( + num_tokens=args.num_tokens, + hidden_size=args.hidden_size, + static_scale=args.static_scale, + quant_dtype=to_torch_dtype(args.quant_dtype), + dtype=STR_DTYPE_TO_TORCH_DTYPE[args.dtype], + seed=args.seed, + do_profile=args.profile, + num_warmup_iters=args.num_warmup_iters, + num_iters=args.num_iters, + ) diff --git a/benchmarks/kernels/benchmark_rmsnorm.py b/benchmarks/kernels/benchmark_rmsnorm.py index 09a319ccf1d1..d720083b6150 100644 --- a/benchmarks/kernels/benchmark_rmsnorm.py +++ b/benchmarks/kernels/benchmark_rmsnorm.py @@ -12,7 +12,6 @@ class HuggingFaceRMSNorm(nn.Module): - def __init__(self, hidden_size: int, eps: float = 1e-6) -> None: super().__init__() self.weight = nn.Parameter(torch.ones(hidden_size)) @@ -114,23 +113,19 @@ def rmsnorm_vllm( def calculate_diff(batch_size, seq_len, hidden_size, use_residual=True): dtype = torch.bfloat16 - x = torch.randn(batch_size, - seq_len, - hidden_size, - dtype=dtype, - device="cuda") + x = torch.randn(batch_size, seq_len, hidden_size, dtype=dtype, device="cuda") weight = torch.ones(hidden_size, dtype=dtype, device="cuda") residual = torch.randn_like(x) if use_residual else None output_naive = rmsnorm_naive( - x.clone(), weight, - residual.clone() if residual is not None else None) + x.clone(), weight, residual.clone() if residual is not None else None + ) output_flashinfer = rmsnorm_flashinfer( - x.clone(), weight, - residual.clone() if residual is not None else None) + x.clone(), weight, residual.clone() if residual is not None else None + ) output_vllm = rmsnorm_vllm( - x.clone(), weight, - residual.clone() if residual is not None else None) + x.clone(), weight, residual.clone() if residual is not None else None + ) if use_residual: output_naive = output_naive[0] @@ -141,9 +136,9 @@ def calculate_diff(batch_size, seq_len, hidden_size, use_residual=True): print(f"FlashInfer output={output_flashinfer}") print(f"vLLM output={output_vllm}") - if torch.allclose(output_naive, output_flashinfer, atol=1e-2, - rtol=1e-2) and torch.allclose( - output_naive, output_vllm, atol=1e-2, rtol=1e-2): + if torch.allclose( + output_naive, 
output_flashinfer, atol=1e-2, rtol=1e-2 + ) and torch.allclose(output_naive, output_vllm, atol=1e-2, rtol=1e-2): print("✅ All implementations match") else: print("❌ Implementations differ") @@ -152,12 +147,10 @@ def calculate_diff(batch_size, seq_len, hidden_size, use_residual=True): batch_size_range = [2**i for i in range(0, 7, 2)] seq_length_range = [2**i for i in range(6, 11, 1)] head_num_range = [32, 48] -configs = list( - itertools.product(head_num_range, batch_size_range, seq_length_range)) +configs = list(itertools.product(head_num_range, batch_size_range, seq_length_range)) def get_benchmark(use_residual): - @triton.testing.perf_report( triton.testing.Benchmark( x_names=["head_num", "batch_size", "seq_len"], @@ -167,19 +160,15 @@ def get_benchmark(use_residual): line_names=["HuggingFace", "FlashInfer", "vLLM"], styles=[("blue", "-"), ("green", "-"), ("red", "-")], ylabel="us", - plot_name= - f"rmsnorm-perf-{'with' if use_residual else 'without'}-residual", + plot_name=f"rmsnorm-perf-{'with' if use_residual else 'without'}-residual", args={}, - )) + ) + ) def benchmark(head_num, batch_size, seq_len, provider): dtype = torch.bfloat16 hidden_size = head_num * 128 # assuming head_dim = 128 - x = torch.randn(batch_size, - seq_len, - hidden_size, - dtype=dtype, - device="cuda") + x = torch.randn(batch_size, seq_len, hidden_size, dtype=dtype, device="cuda") weight = torch.ones(hidden_size, dtype=dtype, device="cuda") residual = torch.randn_like(x) if use_residual else None @@ -240,9 +229,9 @@ def benchmark(head_num, batch_size, seq_len, provider): default=4096, help="Hidden size (2nd dimension) of the sequence", ) - parser.add_argument("--use-residual", - action="store_true", - help="Whether to use residual connection") + parser.add_argument( + "--use-residual", action="store_true", help="Whether to use residual connection" + ) parser.add_argument( "--save-path", type=str, @@ -253,10 +242,12 @@ def benchmark(head_num, batch_size, seq_len, provider): args = parser.parse_args() # Run correctness test - calculate_diff(batch_size=args.batch_size, - seq_len=args.seq_len, - hidden_size=args.hidden_size, - use_residual=args.use_residual) + calculate_diff( + batch_size=args.batch_size, + seq_len=args.seq_len, + hidden_size=args.hidden_size, + use_residual=args.use_residual, + ) # Get the benchmark function with proper use_residual setting benchmark = get_benchmark(args.use_residual) diff --git a/benchmarks/kernels/benchmark_rope.py b/benchmarks/kernels/benchmark_rope.py index 05d24fc4b16d..110d36db157f 100644 --- a/benchmarks/kernels/benchmark_rope.py +++ b/benchmarks/kernels/benchmark_rope.py @@ -6,8 +6,7 @@ import nvtx import torch -from vllm.model_executor.layers.rotary_embedding import (RotaryEmbedding, - get_rope) +from vllm.model_executor.layers.rotary_embedding import RotaryEmbedding, get_rope from vllm.platforms import current_platform from vllm.utils import FlexibleArgumentParser @@ -32,40 +31,49 @@ def benchmark_rope_kernels_multi_lora( # silulating serving 4 LoRAs scaling_factors = [1, 2, 4, 8] # batched RoPE can take multiple scaling factors - batched_rope = get_rope(head_size, rotary_dim, max_position, base, - is_neox_style, { - "rope_type": "linear", - "factor": tuple(scaling_factors) - }) + batched_rope = get_rope( + head_size, + rotary_dim, + max_position, + base, + is_neox_style, + {"rope_type": "linear", "factor": tuple(scaling_factors)}, + ) # non-batched RoPE takes only one scaling factor, we create multiple # instances to simulate the same behavior non_batched_ropes: 
list[RotaryEmbedding] = [] for scaling_factor in scaling_factors: non_batched_ropes.append( - get_rope(head_size, rotary_dim, max_position, base, is_neox_style, - { - "rope_type": "linear", - "factor": (scaling_factor, ) - })) + get_rope( + head_size, + rotary_dim, + max_position, + base, + is_neox_style, + {"rope_type": "linear", "factor": (scaling_factor,)}, + ) + ) positions = torch.randint(0, max_position, (batch_size, seq_len)) - query = torch.randn(batch_size, - seq_len, - num_heads * head_size, - dtype=dtype) + query = torch.randn(batch_size, seq_len, num_heads * head_size, dtype=dtype) key = torch.randn_like(query) # create query offsets for batched RoPE, we concat multiple kv cache # together and each query needs to find the right kv cache of its type offset_map = torch.tensor( list( - accumulate([0] + [ - max_position * scaling_factor * 2 - for scaling_factor in scaling_factors[:-1] - ]))) - query_types = torch.randint(0, - len(scaling_factors), (batch_size, seq_len), - device=device) + accumulate( + [0] + + [ + max_position * scaling_factor * 2 + for scaling_factor in scaling_factors[:-1] + ] + ) + ) + ) + query_types = torch.randint( + 0, len(scaling_factors), (batch_size, seq_len), device=device + ) # map query types to offsets query_offsets = offset_map[query_types] # the kernel takes flattened offsets @@ -86,27 +94,28 @@ def benchmark_rope_kernels_multi_lora( torch.cuda.synchronize() -if __name__ == '__main__': +if __name__ == "__main__": parser = FlexibleArgumentParser( - description="Benchmark the rotary embedding kernels.") + description="Benchmark the rotary embedding kernels." + ) parser.add_argument("--is-neox-style", type=bool, default=True) parser.add_argument("--batch-size", type=int, default=16) parser.add_argument("--seq-len", type=int, default=512) parser.add_argument("--num-heads", type=int, default=8) - parser.add_argument("--head-size", - type=int, - choices=[64, 80, 96, 112, 120, 128, 192, 256], - default=128) + parser.add_argument( + "--head-size", + type=int, + choices=[64, 80, 96, 112, 120, 128, 192, 256], + default=128, + ) parser.add_argument("--rotary-dim", type=int, choices=[16, 32], default=32) - parser.add_argument("--dtype", - type=str, - choices=["bfloat16", "float"], - default="float") + parser.add_argument( + "--dtype", type=str, choices=["bfloat16", "float"], default="float" + ) parser.add_argument("--seed", type=int, default=0) - parser.add_argument("--device", - type=str, - choices=["cuda:0", "cuda:1"], - default="cuda:0") + parser.add_argument( + "--device", type=str, choices=["cuda:0", "cuda:1"], default="cuda:0" + ) args = parser.parse_args() print(args) diff --git a/benchmarks/kernels/benchmark_w8a8_block_fp8.py b/benchmarks/kernels/benchmark_w8a8_block_fp8.py index 8f07bc8ca52e..6315c1ee6cdd 100644 --- a/benchmarks/kernels/benchmark_w8a8_block_fp8.py +++ b/benchmarks/kernels/benchmark_w8a8_block_fp8.py @@ -14,14 +14,16 @@ import triton from vllm.model_executor.layers.quantization.utils.fp8_utils import ( - _w8a8_block_fp8_matmul) + _w8a8_block_fp8_matmul, +) from vllm.platforms import current_platform from vllm.utils import FlexibleArgumentParser mp.set_start_method("spawn", force=True) -assert current_platform.is_cuda( -), "Only support tune w8a8 block fp8 kernel on CUDA device." +assert current_platform.is_cuda(), ( + "Only support tune w8a8 block fp8 kernel on CUDA device." 
+) DTYPE_MAP = { "float32": torch.float32, @@ -40,7 +42,7 @@ def w8a8_block_matmul( config: dict[str, Any], output_dtype: torch.dtype = torch.float16, ) -> torch.Tensor: - """This function performs matrix multiplication with + """This function performs matrix multiplication with block-wise quantization. It takes two input tensors `A` and `B` with scales `As` and `Bs`. @@ -51,7 +53,7 @@ def w8a8_block_matmul( B: The input tensor, e.g., weight. As: The per-token-group quantization scale for `A`. Bs: The per-block quantization scale for `B`. - block_size: The block size for per-block quantization. + block_size: The block size for per-block quantization. It should be 2-dim, e.g., [128, 128]. output_dytpe: The dtype of the returned tensor. @@ -71,18 +73,18 @@ def w8a8_block_matmul( assert triton.cdiv(N, block_n) == Bs.shape[0] assert triton.cdiv(K, block_k) == Bs.shape[1] - C_shape = A.shape[:-1] + (N, ) + C_shape = A.shape[:-1] + (N,) C = A.new_empty(C_shape, dtype=output_dtype) def grid(META): - return (triton.cdiv(M, META["BLOCK_SIZE_M"]) * - triton.cdiv(N, META["BLOCK_SIZE_N"]), ) + return ( + triton.cdiv(M, META["BLOCK_SIZE_M"]) * triton.cdiv(N, META["BLOCK_SIZE_N"]), + ) if A.dtype == torch.float8_e4m3fn: kernel = _w8a8_block_fp8_matmul else: - raise RuntimeError( - "Currently, only support tune w8a8 block fp8 kernel.") + raise RuntimeError("Currently, only support tune w8a8 block fp8 kernel.") kernel[grid]( A, @@ -119,14 +121,16 @@ def get_configs_compute_bound(): for block_n in [32, 64, 128, 256]: for num_warps in [4, 8]: for group_size in [1, 16, 32, 64]: - configs.append({ - "BLOCK_SIZE_M": block_m, - "BLOCK_SIZE_N": block_n, - "BLOCK_SIZE_K": block_k, - "GROUP_SIZE_M": group_size, - "num_warps": num_warps, - "num_stages": num_stages, - }) + configs.append( + { + "BLOCK_SIZE_M": block_m, + "BLOCK_SIZE_N": block_n, + "BLOCK_SIZE_K": block_k, + "GROUP_SIZE_M": group_size, + "num_warps": num_warps, + "num_stages": num_stages, + } + ) return configs @@ -165,15 +169,9 @@ def get_weight_shapes(tp_size): return weight_shapes -def benchmark_config(A, - B, - As, - Bs, - block_size, - config, - out_dtype=torch.float16, - num_iters=10): - +def benchmark_config( + A, B, As, Bs, block_size, config, out_dtype=torch.float16, num_iters=10 +): def run(): w8a8_block_matmul(A, B, As, Bs, block_size, config, out_dtype) @@ -206,26 +204,26 @@ def tune(M, N, K, block_size, out_dtype, search_space, input_type): fp8_max, fp8_min = fp8_info.max, fp8_info.min A_fp32 = ( - (torch.rand(M, K, dtype=torch.float32, device="cuda") - 0.5) * 2 * - fp8_max) + (torch.rand(M, K, dtype=torch.float32, device="cuda") - 0.5) * 2 * fp8_max + ) A = A_fp32.clamp(min=fp8_min, max=fp8_max).to(torch.float8_e4m3fn) B_fp32 = ( - (torch.rand(N, K, dtype=torch.float32, device="cuda") - 0.5) * 2 * - fp8_max) + (torch.rand(N, K, dtype=torch.float32, device="cuda") - 0.5) * 2 * fp8_max + ) B = B_fp32.clamp(min=fp8_min, max=fp8_max).to(torch.float8_e4m3fn) else: - raise RuntimeError( - "Currently, only support tune w8a8 block fp8 kernel.") + raise RuntimeError("Currently, only support tune w8a8 block fp8 kernel.") block_n, block_k = block_size[0], block_size[1] n_tiles = (N + block_n - 1) // block_n k_tiles = (K + block_k - 1) // block_k - As = torch.rand(M, k_tiles, dtype=torch.float32, - device="cuda") * factor_for_scale - Bs = (torch.rand(n_tiles, k_tiles, dtype=torch.float32, device="cuda") * - factor_for_scale) + As = torch.rand(M, k_tiles, dtype=torch.float32, device="cuda") * factor_for_scale + Bs = ( + torch.rand(n_tiles, k_tiles, 
dtype=torch.float32, device="cuda") + * factor_for_scale + ) best_config = None best_time = float("inf") @@ -267,7 +265,8 @@ def save_configs( device_name = current_platform.get_device_name().replace(" ", "_") json_file_name = ( f"N={N},K={K},device_name={device_name},dtype={input_type}_w8a8," - f"block_shape=[{block_n},{block_k}].json") + f"block_shape=[{block_n},{block_k}].json" + ) config_file_path = os.path.join(save_path, json_file_name) print(f"Writing best config to {config_file_path}...") @@ -295,8 +294,7 @@ def tune_on_gpu(args_dict): search_space = get_configs_compute_bound() search_space = [ - config for config in search_space - if block_k % config["BLOCK_SIZE_K"] == 0 + config for config in search_space if block_k % config["BLOCK_SIZE_K"] == 0 ] start = time.time() @@ -312,15 +310,11 @@ def tune_on_gpu(args_dict): out_dtype, search_space, input_type, - ) for batch_size in tqdm(batch_sizes, - desc=f"GPU {gpu_id} - Batch sizes") + ) + for batch_size in tqdm(batch_sizes, desc=f"GPU {gpu_id} - Batch sizes") ] - best_configs = { - M: config - for M, config in zip(batch_sizes, benchmark_results) - } - save_configs(N, K, block_n, block_k, best_configs, save_path, - input_type) + best_configs = {M: config for M, config in zip(batch_sizes, benchmark_results)} + save_configs(N, K, block_n, block_k, best_configs, save_path, input_type) end = time.time() print(f"Tuning on GPU {gpu_id} took {end - start:.2f} seconds") @@ -376,13 +370,14 @@ def main(args): process_args = [] for gpu_id in range(num_gpus): - process_args.append({ - "gpu_id": gpu_id, - "batch_sizes": batches_per_gpu[gpu_id], - "weight_shapes": - weight_shapes, # Each GPU processes all weight shapes - "args": args, - }) + process_args.append( + { + "gpu_id": gpu_id, + "batch_sizes": batches_per_gpu[gpu_id], + "weight_shapes": weight_shapes, # Each GPU processes all weight shapes + "args": args, + } + ) ctx = mp.get_context("spawn") with ctx.Pool(num_gpus) as pool: @@ -398,13 +393,11 @@ def main(args): python3 benchmark_w8a8_block_fp8.py --tp-size 8 --input-type fp8 Then copy to model_executor/layers/quantization/utils/configs """, - formatter_class=argparse.RawTextHelpFormatter) + formatter_class=argparse.RawTextHelpFormatter, + ) parser.add_argument("--tp-size", "-tp", type=int, default=8) - parser.add_argument("--input-type", - type=str, - choices=["fp8"], - default="fp8") + parser.add_argument("--input-type", type=str, choices=["fp8"], default="fp8") parser.add_argument( "--out-dtype", type=str, diff --git a/benchmarks/kernels/deepgemm/benchmark_fp8_block_dense_gemm.py b/benchmarks/kernels/deepgemm/benchmark_fp8_block_dense_gemm.py index 5fa55bb974e1..e37764825451 100644 --- a/benchmarks/kernels/deepgemm/benchmark_fp8_block_dense_gemm.py +++ b/benchmarks/kernels/deepgemm/benchmark_fp8_block_dense_gemm.py @@ -11,7 +11,9 @@ # Import vLLM functions from vllm import _custom_ops as ops from vllm.model_executor.layers.quantization.utils.fp8_utils import ( - per_token_group_quant_fp8, w8a8_block_fp8_matmul) + per_token_group_quant_fp8, + w8a8_block_fp8_matmul, +) from vllm.triton_utils import triton diff --git a/benchmarks/kernels/graph_machete_bench.py b/benchmarks/kernels/graph_machete_bench.py index bd62173a7b3a..ab364a84d6cb 100644 --- a/benchmarks/kernels/graph_machete_bench.py +++ b/benchmarks/kernels/graph_machete_bench.py @@ -14,13 +14,14 @@ if __name__ == "__main__": parser = FlexibleArgumentParser( - description='Benchmark the latency of processing a single batch of ' - 'requests till completion.') - 
parser.add_argument('filename', type=str) + description="Benchmark the latency of processing a single batch of " + "requests till completion." + ) + parser.add_argument("filename", type=str) args = parser.parse_args() - with open(args.filename, 'rb') as f: + with open(args.filename, "rb") as f: data = pickle.load(f) raw_results: list[TMeasurement] = data["results"] @@ -38,11 +39,7 @@ raise Exception("MKN not found") kernel = v.task_spec.description - results[KN].append({ - "kernel": kernel, - "batch_size": M, - "median": v.median - }) + results[KN].append({"kernel": kernel, "batch_size": M, "median": v.median}) rows = int(math.ceil(len(results) / 2)) fig, axs = plt.subplots(rows, 2, figsize=(12, 5 * rows)) @@ -50,14 +47,16 @@ for axs_idx, (shape, data) in enumerate(results.items()): plt.sca(axs[axs_idx]) df = pd.DataFrame(data) - sns.lineplot(data=df, - x="batch_size", - y="median", - hue="kernel", - style="kernel", - markers=True, - dashes=False, - palette="Dark2") + sns.lineplot( + data=df, + x="batch_size", + y="median", + hue="kernel", + style="kernel", + markers=True, + dashes=False, + palette="Dark2", + ) plt.title(f"Shape: {shape}") plt.ylabel("time (median, s)") plt.tight_layout() diff --git a/benchmarks/kernels/utils.py b/benchmarks/kernels/utils.py index ac64f786f184..877a29feed9d 100644 --- a/benchmarks/kernels/utils.py +++ b/benchmarks/kernels/utils.py @@ -23,6 +23,7 @@ class ArgPool: For every invocation during a benchmarking run, it will choose a different value from the list. """ + values: Iterable[Any] def __getitem__(self, index): @@ -30,9 +31,7 @@ def __getitem__(self, index): class Bench: - class ArgsIterator: - def __init__(self, args_list, kwargs_list): assert len(args_list) == len(kwargs_list) self.args_list = args_list @@ -53,10 +52,16 @@ def reset(self): def n_args(self): return self.n - def __init__(self, cuda_graph_params: Optional[CudaGraphBenchParams], - label: str, sub_label: str, description: str, fn: Callable, - *args, **kwargs): - + def __init__( + self, + cuda_graph_params: Optional[CudaGraphBenchParams], + label: str, + sub_label: str, + description: str, + fn: Callable, + *args, + **kwargs, + ): self.cuda_graph_params = cuda_graph_params self.use_cuda_graph = self.cuda_graph_params is not None self.label = label @@ -67,10 +72,8 @@ def __init__(self, cuda_graph_params: Optional[CudaGraphBenchParams], # Process args self._args = args self._kwargs = kwargs - self.args_list, self.kwargs_list = self.collapse_argpool( - *args, **kwargs) - self.args_iterator = self.ArgsIterator(self.args_list, - self.kwargs_list) + self.args_list, self.kwargs_list = self.collapse_argpool(*args, **kwargs) + self.args_iterator = self.ArgsIterator(self.args_list, self.kwargs_list) # Cudagraph runner self.g = None @@ -100,16 +103,13 @@ def collapse_argpool(self, *args, **kwargs): for i in range(argpool_size): # collapse args; Just pick the ith value - args_list[i] = tuple([ - arg[i] if isinstance(arg, ArgPool) else arg - for arg in args_list[i] - ]) + args_list[i] = tuple( + [arg[i] if isinstance(arg, ArgPool) else arg for arg in args_list[i]] + ) # collapse kwargs kwargs_i = kwargs_list[i] - arg_pool_keys = [ - k for k, v in kwargs_i.items() if isinstance(v, ArgPool) - ] + arg_pool_keys = [k for k, v in kwargs_i.items() if isinstance(v, ArgPool)] for k in arg_pool_keys: # again just pick the ith value kwargs_i[k] = kwargs_i[k][i] @@ -142,7 +142,7 @@ def get_cuda_graph_runner(self): def run_cudagrah(self) -> TMeasurement: assert self.use_cuda_graph - globals = {'g': self.g} + 
globals = {"g": self.g} return TBenchmark.Timer( stmt="g.replay()", @@ -162,15 +162,15 @@ def run_eager(self) -> TMeasurement: has_arg_pool = self.args_iterator.n_args > 1 if has_arg_pool: - setup = ''' + setup = """ args_iterator.reset() args_it = args_iterator.__next__() - ''' - stmt = ''' + """ + stmt = """ args, kwargs = next(args_it) fn(*args, **kwargs) - ''' - globals = {'fn': self.fn, 'args_iterator': self.args_iterator} + """ + globals = {"fn": self.fn, "args_iterator": self.args_iterator} else: # no arg pool. Just use the args and kwargs directly self.args_iterator.reset() @@ -178,10 +178,10 @@ def run_eager(self) -> TMeasurement: args, kwargs = next(args_it) setup = "" - stmt = ''' + stmt = """ fn(*args, **kwargs) - ''' - globals = {'fn': self.fn, 'args': args, 'kwargs': kwargs} + """ + globals = {"fn": self.fn, "args": args, "kwargs": kwargs} return TBenchmark.Timer( stmt=stmt, diff --git a/benchmarks/overheads/benchmark_hashing.py b/benchmarks/overheads/benchmark_hashing.py index 5f94552e9dc8..d5701a8fbd6d 100644 --- a/benchmarks/overheads/benchmark_hashing.py +++ b/benchmarks/overheads/benchmark_hashing.py @@ -7,9 +7,8 @@ from vllm.utils import FlexibleArgumentParser # A very long prompt, total number of tokens is about 15k. -LONG_PROMPT = ["You are an expert in large language models, aren't you?" - ] * 1000 -LONG_PROMPT = ' '.join(LONG_PROMPT) +LONG_PROMPT = ["You are an expert in large language models, aren't you?"] * 1000 +LONG_PROMPT = " ".join(LONG_PROMPT) def main(args): @@ -30,32 +29,35 @@ def main(args): print("------start generating------") for i in range(3): - profiler.runctx('llm.generate(LONG_PROMPT, sampling_params)', - globals(), locals()) + profiler.runctx( + "llm.generate(LONG_PROMPT, sampling_params)", globals(), locals() + ) # analyze the runtime of hashing function stats = pstats.Stats(profiler) - stats.sort_stats('cumulative') + stats.sort_stats("cumulative") total_time = 0 total_calls = 0 for func in stats.stats: - if 'hash_of_block' in func[2]: + if "hash_of_block" in func[2]: total_time = stats.stats[func][3] total_calls = stats.stats[func][0] percentage = (total_time / stats.total_tt) * 100 - print(f"Hashing took {total_time:.2f} seconds," - f"{percentage:.2f}% of the total runtime.") + print( + f"Hashing took {total_time:.2f} seconds,{percentage:.2f}% of the total runtime." + ) if __name__ == "__main__": parser = FlexibleArgumentParser( - description='Benchmark the performance of hashing function in' - 'automatic prefix caching.') - parser.add_argument('--model', type=str, default='lmsys/longchat-7b-16k') - parser.add_argument('--tensor-parallel-size', '-tp', type=int, default=1) - parser.add_argument('--output-len', type=int, default=10) - parser.add_argument('--enable-prefix-caching', - action='store_true', - help='enable prefix caching') + description="Benchmark the performance of hashing function in" + "automatic prefix caching." + ) + parser.add_argument("--model", type=str, default="lmsys/longchat-7b-16k") + parser.add_argument("--tensor-parallel-size", "-tp", type=int, default=1) + parser.add_argument("--output-len", type=int, default=10) + parser.add_argument( + "--enable-prefix-caching", action="store_true", help="enable prefix caching" + ) args = parser.parse_args() main(args) diff --git a/benchmarks/pyproject.toml b/benchmarks/pyproject.toml new file mode 100644 index 000000000000..718d52659dda --- /dev/null +++ b/benchmarks/pyproject.toml @@ -0,0 +1,56 @@ +# This local pyproject file is part of the migration from yapf to ruff format. 
+# It uses the same core rules as the main pyproject.toml file, but with the +# following differences: +# - isort profile is set to black +# - ruff line length is overridden to 88 +# - deprecated typing ignores (UP006, UP035) have been removed + +[tool.isort] +profile = "black" +known_local_folder = ["vllm"] + +[tool.ruff] +line-length = 88 +exclude = [ + # External file, leaving license intact + "examples/other/fp8/quantizer/quantize.py", + "vllm/vllm_flash_attn/flash_attn_interface.pyi" +] + +[tool.ruff.lint.per-file-ignores] +"vllm/third_party/**" = ["ALL"] +"vllm/version.py" = ["F401"] +"vllm/_version.py" = ["ALL"] + +[tool.ruff.lint] +select = [ + # pycodestyle + "E", + # Pyflakes + "F", + # pyupgrade + "UP", + # flake8-bugbear + "B", + # flake8-simplify + "SIM", + # isort + "I", + # flake8-logging-format + "G", +] +ignore = [ + # star imports + "F405", "F403", + # lambda expression assignment + "E731", + # Loop control variable not used within loop body + "B007", + # f-string format + "UP032", + # Can remove once 3.10+ is the minimum Python version + "UP007", +] + +[tool.ruff.format] +docstring-code-format = true \ No newline at end of file diff --git a/pyproject.toml b/pyproject.toml index 0393bb1ed27e..c9ad30510e6c 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -54,6 +54,7 @@ include = ["vllm*"] [tool.yapfignore] ignore_patterns = [ ".buildkite/**", + "benchmarks/**", "build/**", ] From 2ac2867ff7b4f56a2873b472832bdb32af6acf73 Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Tue, 13 May 2025 11:39:46 +0100 Subject: [PATCH 2/4] Change vLLM to known first party Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- benchmarks/pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/benchmarks/pyproject.toml b/benchmarks/pyproject.toml index 718d52659dda..a1f59f78d2a8 100644 --- a/benchmarks/pyproject.toml +++ b/benchmarks/pyproject.toml @@ -7,7 +7,7 @@ [tool.isort] profile = "black" -known_local_folder = ["vllm"] +known_first_party = ["vllm"] [tool.ruff] line-length = 88 From 7b4509328d9e70f1d939d9e39e18cba02856d0f9 Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Tue, 13 May 2025 12:03:22 +0100 Subject: [PATCH 3/4] Skip `isort` as we transition to isort via `ruff` Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- .buildkite/pyproject.toml | 4 ---- .pre-commit-config.yaml | 2 -- benchmarks/pyproject.toml | 9 ++++----- pyproject.toml | 4 ++++ 4 files changed, 8 insertions(+), 11 deletions(-) diff --git a/.buildkite/pyproject.toml b/.buildkite/pyproject.toml index 6ae0c2a3990f..083bb795caf5 100644 --- a/.buildkite/pyproject.toml +++ b/.buildkite/pyproject.toml @@ -1,13 +1,9 @@ # This local pyproject file is part of the migration from yapf to ruff format. # It uses the same core rules as the main pyproject.toml file, but with the # following differences: -# - isort profile is set to black # - ruff line length is overridden to 88 # - deprecated typing ignores (UP006, UP035) have been removed -[tool.isort] -profile = "black" - [tool.ruff] line-length = 88 exclude = [ diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 7f01c041320b..f5c0c368d578 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -28,8 +28,6 @@ repos: rev: 6.0.1 hooks: - id: isort - # necessary during the transition from yapf to ruff format - args: [--resolve-all-configs, --config-root, .] 
- repo: https://github.com/pre-commit/mirrors-clang-format rev: v20.1.3 hooks: diff --git a/benchmarks/pyproject.toml b/benchmarks/pyproject.toml index a1f59f78d2a8..c1450436a470 100644 --- a/benchmarks/pyproject.toml +++ b/benchmarks/pyproject.toml @@ -1,14 +1,9 @@ # This local pyproject file is part of the migration from yapf to ruff format. # It uses the same core rules as the main pyproject.toml file, but with the # following differences: -# - isort profile is set to black # - ruff line length is overridden to 88 # - deprecated typing ignores (UP006, UP035) have been removed -[tool.isort] -profile = "black" -known_first_party = ["vllm"] - [tool.ruff] line-length = 88 exclude = [ @@ -52,5 +47,9 @@ ignore = [ "UP007", ] +[tool.ruff.lint.isort] +known-first-party = ["vllm"] +known-local-folder = ["benchmarks"] + [tool.ruff.format] docstring-code-format = true \ No newline at end of file diff --git a/pyproject.toml b/pyproject.toml index c9ad30510e6c..cfe57094b2e1 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -158,6 +158,10 @@ ignore-words-list = "dout, te, indicies, subtile, ElementE" skip = "tests/models/fixtures/*,tests/prompts/*,benchmarks/sonnet.txt,tests/lora/data/*,build/*,vllm/third_party/*" [tool.isort] +skip_glob = [ + ".buildkite/*", + "benchmarks/*", +] use_parentheses = true skip_gitignore = true From 2354bef4b4478cd7099218eb10cea4c61192bef0 Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Tue, 13 May 2025 12:20:33 +0100 Subject: [PATCH 4/4] Don't tell it about local folders Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- benchmarks/benchmark_latency.py | 2 +- benchmarks/benchmark_serving.py | 5 +++-- benchmarks/benchmark_serving_structured_output.py | 5 +++-- benchmarks/benchmark_throughput.py | 6 +++--- benchmarks/pyproject.toml | 1 - 5 files changed, 10 insertions(+), 9 deletions(-) diff --git a/benchmarks/benchmark_latency.py b/benchmarks/benchmark_latency.py index 713fcd0dc15b..d5aaceeb8c9c 100644 --- a/benchmarks/benchmark_latency.py +++ b/benchmarks/benchmark_latency.py @@ -11,9 +11,9 @@ import numpy as np import torch -from benchmark_utils import convert_to_pytorch_benchmark_format, write_to_json from tqdm import tqdm +from benchmark_utils import convert_to_pytorch_benchmark_format, write_to_json from vllm import LLM, SamplingParams from vllm.engine.arg_utils import EngineArgs from vllm.inputs import PromptType diff --git a/benchmarks/benchmark_serving.py b/benchmarks/benchmark_serving.py index 883e4e47d028..a887e7150dc7 100644 --- a/benchmarks/benchmark_serving.py +++ b/benchmarks/benchmark_serving.py @@ -35,14 +35,15 @@ from typing import Any, Optional import numpy as np +from tqdm.asyncio import tqdm +from transformers import PreTrainedTokenizerBase + from backend_request_func import ( ASYNC_REQUEST_FUNCS, OPENAI_COMPATIBLE_BACKENDS, RequestFuncInput, RequestFuncOutput, ) -from tqdm.asyncio import tqdm -from transformers import PreTrainedTokenizerBase try: from vllm.transformers_utils.tokenizer import get_tokenizer diff --git a/benchmarks/benchmark_serving_structured_output.py b/benchmarks/benchmark_serving_structured_output.py index 5432e0d60787..5088c805f53e 100644 --- a/benchmarks/benchmark_serving_structured_output.py +++ b/benchmarks/benchmark_serving_structured_output.py @@ -37,13 +37,14 @@ import datasets import numpy as np import pandas as pd +from tqdm.asyncio import tqdm +from transformers import PreTrainedTokenizerBase + from backend_request_func import ( 
ASYNC_REQUEST_FUNCS, RequestFuncInput, RequestFuncOutput, ) -from tqdm.asyncio import tqdm -from transformers import PreTrainedTokenizerBase try: from vllm.transformers_utils.tokenizer import get_tokenizer diff --git a/benchmarks/benchmark_throughput.py b/benchmarks/benchmark_throughput.py index dff0fc862564..7a13babda9d1 100644 --- a/benchmarks/benchmark_throughput.py +++ b/benchmarks/benchmark_throughput.py @@ -12,6 +12,9 @@ import torch import uvloop +from tqdm import tqdm +from transformers import AutoModelForCausalLM, AutoTokenizer, PreTrainedTokenizerBase + from benchmark_dataset import ( AIMODataset, BurstGPTDataset, @@ -24,9 +27,6 @@ VisionArenaDataset, ) from benchmark_utils import convert_to_pytorch_benchmark_format, write_to_json -from tqdm import tqdm -from transformers import AutoModelForCausalLM, AutoTokenizer, PreTrainedTokenizerBase - from vllm.engine.arg_utils import AsyncEngineArgs, EngineArgs from vllm.entrypoints.openai.api_server import ( build_async_engine_client_from_engine_args, diff --git a/benchmarks/pyproject.toml b/benchmarks/pyproject.toml index c1450436a470..f825cb203269 100644 --- a/benchmarks/pyproject.toml +++ b/benchmarks/pyproject.toml @@ -49,7 +49,6 @@ ignore = [ [tool.ruff.lint.isort] known-first-party = ["vllm"] -known-local-folder = ["benchmarks"] [tool.ruff.format] docstring-code-format = true \ No newline at end of file