10 changes: 7 additions & 3 deletions docs/source/user_guide/additional_config.md
@@ -28,7 +28,8 @@ The following table lists the additional configuration options available in vLLM
| ---- | ---- | ------- | ----------- |
| `torchair_graph_config` | dict | `{}` | The config options for torchair graph mode |
| `ascend_scheduler_config` | dict | `{}` | The config options for ascend scheduler |
| `expert_tensor_parallel_size` | str | `1` | Expert tensor parallel size the model to use. |
| `expert_tensor_parallel_size` | str | `0` | Expert tensor parallel size the model to use. |
| `refresh` | bool | `false` | Whether to refresh global ascend config content. This value is usually used by rlhf case. |
Collaborator:

QQ: Does vLLM have a similar config?

Collaborator Author @wangxiyuan, Jun 6, 2025:
No, this value is only used for RLHF. For verl or some other framework, the case is:

  1. verl loads and updates the vLLM config
  2. verl starts the LLM with an additional config in external_executor mode

In the first step, the ascend config has already been initialized, so in the second step the additional config would be skipped.

To solve the problem, we should let verl pass `refresh`; then we can regenerate the config.

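For reference, a minimal sketch of the flow described above, modeled on the new `test_ascend_config_refresh` test in this PR. The direct assignment to `vllm_config.additional_config` stands in for what the RLHF framework (e.g. verl) would do; it is illustrative only, not verl's actual API.

```python
from vllm.config import get_current_vllm_config

from vllm_ascend.ascend_config import get_ascend_config, init_ascend_config

vllm_config = get_current_vllm_config()

# Step 1: the framework loads/updates the vLLM config; the global ascend
# config is built here, while additional_config is still empty.
init_ascend_config(vllm_config)

# Step 2: the framework starts the LLM with an additional_config. Without
# "refresh": True, init_ascend_config would just return the config built in
# step 1 and these options would be silently ignored.
vllm_config.additional_config = {
    "torchair_graph_config": {"use_cached_graph": True},
    "refresh": True,  # force the global ascend config to be rebuilt
}
init_ascend_config(vllm_config)

assert get_ascend_config().torchair_graph_config.use_cached_graph
```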

The details of each config option are as follows:

@@ -40,6 +41,7 @@ The details of each config option are as follows:
| `use_cached_graph` | bool | `False` | Whether to use cached graph |
| `graph_batch_sizes` | list[int] | `[]` | The batch size for torchair graph cache |
| `graph_batch_sizes_init` | bool | `False` | Init graph batch size dynamically if `graph_batch_sizes` is empty |
| `enable_multistream_shared_expert`| bool | `False` | Whether to enable multistream shared expert |

**ascend_scheduler_config**

@@ -59,12 +61,14 @@ A full example of additional configuration is as follows:
"enabled": true,
"use_cached_graph": true,
"graph_batch_sizes": [1, 2, 4, 8],
"graph_batch_sizes_init": true
"graph_batch_sizes_init": false,
"enable_multistream_shared_expert": false
},
"ascend_scheduler_config": {
"enabled": true,
"chunked_prefill_enabled": true,
},
"expert_tensor_parallel_size": 1
"expert_tensor_parallel_size": 1,
"refresh": false,
}
```
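As a usage note (not part of this diff): the dictionary above is passed through vLLM's `additional_config` engine argument. A minimal offline-inference sketch, assuming that argument is available; the model name is a placeholder.

```python
from vllm import LLM

# Sketch only: additional_config is forwarded to the engine the same way the
# VllmRunner-based tests in this PR pass it.
llm = LLM(
    model="Qwen/Qwen2.5-7B-Instruct",  # placeholder model
    additional_config={
        "ascend_scheduler_config": {"enabled": True},
        "expert_tensor_parallel_size": 1,
    },
)
outputs = llm.generate("Hello, my name is")
```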
78 changes: 74 additions & 4 deletions tests/singlecard/test_ascend_config.py
@@ -13,10 +13,13 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os

import pytest

from tests.conftest import VllmRunner
from vllm_ascend.ascend_config import clear_ascend_config, get_ascend_config
from vllm_ascend.ascend_config import (clear_ascend_config, get_ascend_config,
init_ascend_config)


def _clean_up_ascend_config(func):
@@ -39,28 +42,35 @@ def test_run_without_ascend_config():
assert ascend_config.torchair_graph_config.graph_batch_sizes == []
assert not ascend_config.torchair_graph_config.graph_batch_sizes_init
assert not ascend_config.ascend_scheduler_config.enabled
assert ascend_config.expert_tensor_parallel_size == 1
assert ascend_config.expert_tensor_parallel_size == 0


@_clean_up_ascend_config
def test_run_with_ascend_config():
input_additional_config = {
if os.getenv("VLLM_USE_V1") == "0":
pytest.skip("graph only works on v1")

input_additional_config_1 = {
"torchair_graph_config": {
# torchair graph only works with deepseek. The e2e test should be added
# in multicard test with deepseek models.
"enabled": False,
"use_cached_graph": True,
"graph_batch_sizes": [1, 2, 4, 8],
"graph_batch_sizes_init": False,
"enable_multistream_shared_expert": True,
},
"ascend_scheduler_config": {
"enabled": True,
"enable_chunked_prefill": True,
},
"expert_tensor_parallel_size": 1
}

# check passed with eager mode
with VllmRunner("facebook/opt-125m",
additional_config=input_additional_config):
enforce_eager=True,
additional_config=input_additional_config_1):
ascend_config = get_ascend_config()

assert not ascend_config.torchair_graph_config.enabled
@@ -69,6 +79,7 @@ def test_run_with_ascend_config():
1, 2, 4, 8
]
assert not ascend_config.torchair_graph_config.graph_batch_sizes_init
assert ascend_config.torchair_graph_config.enable_multistream_shared_expert
assert ascend_config.ascend_scheduler_config.enabled
assert ascend_config.ascend_scheduler_config.enable_chunked_prefill
assert ascend_config.expert_tensor_parallel_size == 1
@@ -83,6 +94,8 @@ def test_ascend_config_init_error():

@_clean_up_ascend_config
def test_ascend_config_load_error():
if os.getenv("VLLM_USE_V1") == "0":
pytest.skip("graph only works on v1")
# graph_batch_sizes should be list.
with pytest.raises(TypeError):
input_additional_config_fake_1 = {
@@ -117,3 +130,60 @@ def test_ascend_config_load_error():
enforce_eager=False,
additional_config=input_additional_config_fake_2):
pass

# torchair graph should not be enabled with eager mode
with pytest.raises(RuntimeError):
input_additional_config_fake_3 = {
"torchair_graph_config": {
"enabled": True,
},
}
with VllmRunner("facebook/opt-125m",
enforce_eager=True,
additional_config=input_additional_config_fake_3):
pass


@_clean_up_ascend_config
def test_check_ascend_config_v0():
if os.getenv("VLLM_USE_V1") == "1":
pytest.skip("graph only works on v1, this is the test for v0")
with pytest.raises(NotImplementedError):
input_additional_config_fake_1 = {
"torchair_graph_config": {
"enabled": True,
},
}
with VllmRunner("facebook/opt-125m",
additional_config=input_additional_config_fake_1):
pass


@_clean_up_ascend_config
def test_ascend_config_refresh():
from vllm.config import get_current_vllm_config
vllm_config = get_current_vllm_config()
# set additional_config with none
init_ascend_config(vllm_config)

input_additional_config = {
"torchair_graph_config": {
"enabled": False,
"use_cached_graph": True,
"graph_batch_sizes": [1, 2, 4, 8],
"graph_batch_sizes_init": False,
},
"refresh": True,
}

# refresh ascend config
with VllmRunner("facebook/opt-125m",
additional_config=input_additional_config):
ascend_config = get_ascend_config()

assert not ascend_config.torchair_graph_config.enabled
assert ascend_config.torchair_graph_config.use_cached_graph
assert ascend_config.torchair_graph_config.graph_batch_sizes == [
1, 2, 4, 8
]
assert not ascend_config.torchair_graph_config.graph_batch_sizes_init
86 changes: 53 additions & 33 deletions vllm_ascend/ascend_config.py
@@ -37,7 +37,7 @@ def __init__(self, vllm_config):
ascend_scheduler_config)

self.expert_tensor_parallel_size = int(
additional_config.get("expert_tensor_parallel_size", 1))
additional_config.get("expert_tensor_parallel_size", 0))


class TorchairGraphConfig:
@@ -82,8 +82,11 @@ def __init__(self, ascend_scheduler_config: dict):


def init_ascend_config(vllm_config):
additional_config = vllm_config.additional_config if vllm_config.additional_config is not None else {}
refresh = additional_config.get("refresh",
False) if additional_config else False
global _ASCEND_CONFIG
if _ASCEND_CONFIG is not None:
if _ASCEND_CONFIG is not None and not refresh:
return _ASCEND_CONFIG
_ASCEND_CONFIG = AscendConfig(vllm_config)
return _ASCEND_CONFIG
@@ -106,35 +109,52 @@ def get_ascend_config():
def check_ascend_config(vllm_config, enforce_eager):
ascend_config = get_ascend_config()

# Both for V0 and V1 Engine, torchair_graph cannot be enabled with eager mode.
if ascend_config.torchair_graph_config.enabled and enforce_eager:
raise RuntimeError(
"Can't enable graph mode and eager mode at the same time. Please set `enforce_eager=False` if you attempt to enable NPU graph mode."
)

# torchair_graph only work with deepseek model and mla enabled.
if ascend_config.torchair_graph_config.enabled:
if envs.VLLM_MLA_DISABLE:
logger.warning(
"Torchair graph mode is still experimental and not supported for V1 without mla currently, "
"it has been disabled automatically.")
ascend_config.ascend_scheduler_config.enabled = False
if vllm_config.model_config:
model_type = vllm_config.model_config.hf_config.model_type
if "deepseek" not in model_type:
raise NotImplementedError(
"Torchair graph mode only works with deepseek model.")

# for V1 Engine, aclgraph doesn't work with deepseek model and only qwen model is well tested.
if envs.VLLM_USE_V1 and vllm_config.model_config is not None and not enforce_eager:
model_type = vllm_config.model_config.hf_config.model_type
if "deepseek" in model_type:
# for v0 engine
if not envs.VLLM_USE_V1:
if ascend_config.torchair_graph_config.enabled:
raise NotImplementedError(
"Torchair graph mode is only supported for V1 Engine.")
if ascend_config.ascend_scheduler_config.enabled:
raise NotImplementedError(
"ACL Graph does not support deepseek. Please "
"try torchair graph mode to serve deepseek models on vllm-ascend."
" Or set `enforce_eager=True` to use eager mode.")
if "qwen" not in model_type:
logger.warning(
"ACL Graph is currently experimental. Please "
"raise an issue on https://github.com/vllm-project/vllm-ascend/issues"
" if you encourage any Error")
"Ascend scheduler is only supported for V1 Engine.")
# for v1 engine
else:
# for eager mode
if enforce_eager:
# torchair_graph cannot be enabled with eager mode.
if ascend_config.torchair_graph_config.enabled:
raise RuntimeError(
"Can't enable graph mode and eager mode at the same time. Please set `enforce_eager=False` if you attempt to enable NPU graph mode."
)
# for graph mode
else:
# torchair_graph case
if ascend_config.torchair_graph_config.enabled:
# torchair_graph is not supported for V1 without mla currently.
if envs.VLLM_MLA_DISABLE:
logger.warning(
"Torchair graph mode is still experimental and not supported for V1 without mla currently, "
"it has been disabled automatically.")
ascend_config.torchair_graph_config.enabled = False
# torchair_graph is supported for deepseek model only currently.
if vllm_config.model_config:
model_type = vllm_config.model_config.hf_config.model_type
if "deepseek" not in model_type:
raise NotImplementedError(
"Torchair graph mode only works with deepseek model."
)
# aclgraph case
else:
# aclgraph doesn't work with deepseek model and only qwen model is well tested.
if vllm_config.model_config:
model_type = vllm_config.model_config.hf_config.model_type
if "deepseek" in model_type:
raise NotImplementedError(
"ACL Graph does not support deepseek. Please "
"try torchair graph mode to serve deepseek models on vllm-ascend."
" Or set `enforce_eager=True` to use eager mode.")
if "qwen" not in model_type:
logger.warning(
"ACL Graph is currently experimental. Please "
"raise an issue on https://github.com/vllm-project/vllm-ascend/issues"
" if you encourage any Error")
2 changes: 1 addition & 1 deletion vllm_ascend/platform.py
@@ -133,7 +133,7 @@ def check_and_update_config(cls, vllm_config: VllmConfig) -> None:

# NOTE: When enable_expert_parallel is True, we follow vLLM convention:
# ep_size = world_size, which means expert_tensor_parallel_size must be 1
if ascend_config.expert_tensor_parallel_size > 1 and not parallel_config.enable_expert_parallel:
if ascend_config.expert_tensor_parallel_size > 0 and not parallel_config.enable_expert_parallel:
parallel_config.expert_tensor_parallel_size = ascend_config.expert_tensor_parallel_size

# Calculate expert parallel size based on world size
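To make the intent of the changed guard concrete: with the default moved from `1` to `0`, a value of `0` now reads as "not set", so an explicitly configured value of `1` is applied where it previously fell through. A self-contained illustration follows; the helper `resolve_expert_tp` is hypothetical and not part of the code base.

```python
from typing import Optional


def resolve_expert_tp(configured_etp: int,
                      enable_expert_parallel: bool) -> Optional[int]:
    """Return the expert TP size to apply, or None to keep vLLM's default."""
    # Mirrors the guard above: 0 means "unset"; when expert parallel is on,
    # ep_size follows world_size, so the override is skipped.
    if configured_etp > 0 and not enable_expert_parallel:
        return configured_etp
    return None


assert resolve_expert_tp(0, False) is None   # default config: nothing overridden
assert resolve_expert_tp(1, False) == 1      # explicit 1 now takes effect
assert resolve_expert_tp(4, True) is None    # EP enabled: no override
```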
2 changes: 1 addition & 1 deletion vllm_ascend/worker/model_runner_v1.py
@@ -323,7 +323,7 @@ def __init__(self, vllm_config: VllmConfig, device: torch.device):

ascend_config = get_ascend_config()
self.torchair_graph_enabled = ascend_config.torchair_graph_config.enabled and self.vllm_config.model_config.use_mla
self.torchair_graph_use_cached_npu_graph = ascend_config.torchair_graph_config.use_cached_graph
self.use_cached_npu_graph = ascend_config.torchair_graph_config.use_cached_graph
self.torchair_graph_batch_sizes = ascend_config.torchair_graph_config.graph_batch_sizes

if ascend_config.torchair_graph_config.graph_batch_sizes_init: