From a3ceb5c1f2b899c281c69f227d67d2a70ebab73c Mon Sep 17 00:00:00 2001 From: kevin Date: Mon, 10 Feb 2025 05:45:26 +0000 Subject: [PATCH 1/3] add retries --- vllm/transformers_utils/config.py | 51 ++++++++++++++++++++++--------- 1 file changed, 37 insertions(+), 14 deletions(-) diff --git a/vllm/transformers_utils/config.py b/vllm/transformers_utils/config.py index 42b45e10e3f2..b45835886574 100644 --- a/vllm/transformers_utils/config.py +++ b/vllm/transformers_utils/config.py @@ -5,7 +5,7 @@ import os from pathlib import Path from typing import Any, Dict, Literal, Optional, Type, Union - +import time import huggingface_hub from huggingface_hub import (file_exists, hf_hub_download, list_repo_files, try_to_load_from_cache) @@ -100,15 +100,26 @@ def file_or_path_exists(model: Union[str, Path], config_name: str, # NB: file_exists will only check for the existence of the config file on # hf_hub. This will fail in offline mode. - try: - return file_exists(model, - config_name, - revision=revision, - token=HF_TOKEN) - except huggingface_hub.errors.OfflineModeIsEnabled: - # Don't raise in offline mode, all we know is that we don't have this - # file cached. - return False + + # Call HF to check if the file exists, with 3 retries and exponential backoff + max_retries = 3 + retry_delay = 2 + for attempt in range(max_retries): + try: + return file_exists(model, + config_name, + revision=revision, + token=HF_TOKEN) + except huggingface_hub.errors.OfflineModeIsEnabled: + # Don't raise in offline mode, all we know is that we don't have this + # file cached. + return False + except Exception as e: + logger.error(f"Error checking file existence: {e}") + if attempt == max_retries - 1: + raise + time.sleep(retry_delay) + retry_delay *= 2 def patch_rope_scaling(config: PretrainedConfig) -> None: @@ -193,10 +204,22 @@ def get_config( # raise an offline mode error to indicate to the user that they # don't have files cached and may need to go online. 
# This is conveniently triggered by calling file_exists(). - file_exists(model, - HF_CONFIG_NAME, - revision=revision, - token=HF_TOKEN) + + # Call HF to check if the file exists, with 3 retries and exponential backoff + max_retries = 3 + retry_delay = 2 + for attempt in range(max_retries): + try: + file_exists(model, + HF_CONFIG_NAME, + revision=revision, + token=HF_TOKEN) + except Exception as e: + logger.error(f"Error checking file existence: {e}") + if attempt == max_retries - 1: + raise e + time.sleep(retry_delay) + retry_delay *= 2 raise ValueError(f"No supported config format found in {model}") From a04ad7a9184e4221a177096dd5d0e8fc0e8db17f Mon Sep 17 00:00:00 2001 From: kevin Date: Mon, 10 Feb 2025 06:05:42 +0000 Subject: [PATCH 2/3] add log error for retry Signed-off-by: kevin --- vllm/transformers_utils/config.py | 34 ++++++++++++++++++++----------- 1 file changed, 22 insertions(+), 12 deletions(-) diff --git a/vllm/transformers_utils/config.py b/vllm/transformers_utils/config.py index b45835886574..c30c670f8291 100644 --- a/vllm/transformers_utils/config.py +++ b/vllm/transformers_utils/config.py @@ -3,9 +3,10 @@ import enum import json import os +import time from pathlib import Path from typing import Any, Dict, Literal, Optional, Type, Union -import time + import huggingface_hub from huggingface_hub import (file_exists, hf_hub_download, list_repo_files, try_to_load_from_cache) @@ -101,22 +102,27 @@ def file_or_path_exists(model: Union[str, Path], config_name: str, # NB: file_exists will only check for the existence of the config file on # hf_hub. This will fail in offline mode. 
- # Call HF to check if the file exists, with 3 retries and exponential backoff - max_retries = 3 + # Call HF to check if the file exists + # 2 retries and exponential backoff + max_retries = 2 retry_delay = 2 for attempt in range(max_retries): try: return file_exists(model, - config_name, - revision=revision, - token=HF_TOKEN) + config_name, + revision=revision, + token=HF_TOKEN) except huggingface_hub.errors.OfflineModeIsEnabled: - # Don't raise in offline mode, all we know is that we don't have this + # Don't raise in offline mode, + # all we know is that we don't have this # file cached. return False except Exception as e: - logger.error(f"Error checking file existence: {e}") + logger.error( + "Error checking file existence: %s, retrying %d of %d", e, + attempt + 1, max_retries) if attempt == max_retries - 1: + logger.error("Error checking file existence: %s", e) raise time.sleep(retry_delay) retry_delay *= 2 @@ -205,8 +211,9 @@ def get_config( # don't have files cached and may need to go online. # This is conveniently triggered by calling file_exists(). 
- # Call HF to check if the file exists, with 3 retries and exponential backoff - max_retries = 3 + # Call HF to check if the file exists + # up to 2 attempts, exponential backoff between attempts + max_retries = 2 retry_delay = 2 for attempt in range(max_retries): try: @@ -215,8 +222,11 @@ def get_config( revision=revision, token=HF_TOKEN) except Exception as e: - logger.error(f"Error checking file existence: {e}") - if attempt == max_retries - 1: + logger.error( + "Error checking file existence: %s, retrying %d of %d", + e, attempt + 1, max_retries) + if attempt == max_retries - 1: + logger.error("Error checking file existence: %s", e) raise e time.sleep(retry_delay) retry_delay *= 2 From a5f5abf8345163f5a8350a07702e588e352d936c Mon Sep 17 00:00:00 2001 From: kevin Date: Mon, 10 Feb 2025 06:23:49 +0000 Subject: [PATCH 3/3] fix lint Signed-off-by: kevin --- vllm/transformers_utils/config.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/vllm/transformers_utils/config.py b/vllm/transformers_utils/config.py index c30c670f8291..aade28610b31 100644 --- a/vllm/transformers_utils/config.py +++ b/vllm/transformers_utils/config.py @@ -126,6 +126,8 @@ def file_or_path_exists(model: Union[str, Path], config_name: str, raise time.sleep(retry_delay) retry_delay *= 2 + continue + return False def patch_rope_scaling(config: PretrainedConfig) -> None: