233 changes: 107 additions & 126 deletions tests/integration/defs/agg_unit_mem_df.csv

Large diffs are not rendered by default.

5 changes: 3 additions & 2 deletions tests/integration/defs/test_unittests.py
@@ -13,6 +13,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 import os
+import warnings
 from subprocess import CalledProcessError
 
 from defs.conftest import tests_path
@@ -100,8 +101,8 @@ def test_unittests_v2(llm_root, llm_venv, case: str, output_dir, request):
         num_workers = parallel_dict[cur_key]
         num_workers = min(num_workers, 8)
     else:
-        print(
-            f'unittest {case} on "{gpu_name}" is not recorded in parallel config. Need to profile.'
+        warnings.warn(
+            f'Cannot find parallel config entry for unittest {case} on "{gpu_name}". Fallback to serial test. Please add config entry to agg_unit_mem_df.csv.'
         )
 
     num_workers = max(1, num_workers)
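For context, a minimal sketch (not the repository's code) of the lookup-and-fallback behaviour this change adjusts: when a test/GPU pair has no entry in agg_unit_mem_df.csv, the runner now emits a warning and falls back to a single worker instead of printing a profiling note. The key shape and helper name below are illustrative assumptions.

import warnings


def resolve_num_workers(parallel_dict: dict, case: str, gpu_name: str) -> int:
    # Hypothetical key shape; the real lookup key in test_unittests.py may differ.
    cur_key = (case, gpu_name)
    if cur_key in parallel_dict:
        num_workers = min(parallel_dict[cur_key], 8)
    else:
        warnings.warn(
            f'Cannot find parallel config entry for unittest {case} on '
            f'"{gpu_name}". Fallback to serial test.')
        num_workers = 1
    return max(1, num_workers)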
16 changes: 16 additions & 0 deletions tests/integration/defs/trt_test_alternative.py
@@ -93,16 +93,32 @@ def cleanup_process_tree(p: subprocess.Popen,
         time.sleep(5)
 
     lines = []
+    torch_inductors = []
+
     for pid in sorted(target_pids):
         try:
             sp = psutil.Process(pid)
             if verbose_message:
                 cmdline = sp.cmdline()
+
+                # Detect repetitive torch inductor worker processes
+                if len(cmdline) > 3 and \
+                   'python' in cmdline[0] and \
+                   'torch/_inductor/compile_worker/__main__.py' in cmdline[1] and \
+                   '--pickler=torch._inductor.compile_worker.subproc_pool.SubprocPickler' == cmdline[2]:
+                    torch_inductors.append(pid)
+                    continue
+
                 lines.append(f"{pid}: {cmdline}")
             persist_pids.append(pid)
         except psutil.Error:
             pass
 
+    if torch_inductors:
+        lines.append(
+            f"{len(torch_inductors)}*torch inductor workers: {torch_inductors}"
+        )
+
     if persist_pids:
         msg = f"Found leftover subprocesses: {persist_pids} launched by {p.args}"
         if verbose_message:
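The detection above keys off the distinctive command line of torch's inductor compile workers. A standalone sketch of the same idea, assuming only psutil; the PID list and the summary formatting are illustrative, not the repository's API:

import psutil


def find_inductor_workers(pids):
    # Group torch inductor compile-worker PIDs by inspecting their command lines.
    workers = []
    for pid in pids:
        try:
            cmdline = psutil.Process(pid).cmdline()
        except psutil.Error:
            continue
        if len(cmdline) > 1 and 'python' in cmdline[0] and \
                'torch/_inductor/compile_worker/__main__.py' in cmdline[1]:
            workers.append(pid)
    return workers


# Example: summarize the workers instead of listing each one in the leftover report.
# workers = find_inductor_workers(target_pids)
# print(f"{len(workers)}*torch inductor workers: {workers}")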
2 changes: 1 addition & 1 deletion tests/unittest/_torch/speculative/test_draft_target.py
@@ -28,7 +28,7 @@ def test_llama_draft_target(use_cuda_graph: bool, attn_backend: str):
 
     max_batch_size = 2
     max_draft_len = 4
-    kv_cache_config = KvCacheConfig(enable_block_reuse=False)
+    kv_cache_config = KvCacheConfig(enable_block_reuse=False, max_tokens=8192)
     cuda_graph_config = CudaGraphConfig(
         batch_sizes=[1]) if use_cuda_graph else None
 
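This and the following speculative-decoding tests switch the KV cache from a fraction of free GPU memory to a fixed token budget, which keeps the cache footprint stable regardless of how much memory happens to be free on the test GPU. A hedged sketch of the two configuration styles; the import path is an assumption based on the LLM API these tests use:

from tensorrt_llm.llmapi import KvCacheConfig

# Before: the cache is sized from whatever GPU memory is free at setup time,
# so the footprint varies between machines and between co-located tests.
fraction_based = KvCacheConfig(enable_block_reuse=False,
                               free_gpu_memory_fraction=0.5)

# After: a fixed budget of 8192 cached tokens, as in the tests in this PR.
token_based = KvCacheConfig(enable_block_reuse=False, max_tokens=8192)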
3 changes: 1 addition & 2 deletions tests/unittest/_torch/speculative/test_dynamic_spec_decode.py
@@ -26,8 +26,7 @@ def test_dynamic_spec_decode():
 
     max_batch_size = 1
     max_draft_len = 4
-    kv_cache_config = KvCacheConfig(enable_block_reuse=True,
-                                    free_gpu_memory_fraction=0.5)
+    kv_cache_config = KvCacheConfig(enable_block_reuse=True, max_tokens=8192)
     cuda_graph_config = CudaGraphConfig(batch_sizes=[1])
 
     llm_common_config = dict(
4 changes: 2 additions & 2 deletions tests/unittest/_torch/speculative/test_eagle3.py
@@ -46,7 +46,7 @@ def test_llama_eagle3(use_cuda_graph: bool, attn_backend: str,
     max_batch_size = 1
     max_draft_len = 4
     kv_cache_config = KvCacheConfig(enable_block_reuse=enable_block_reuse,
-                                    free_gpu_memory_fraction=0.5)
+                                    max_tokens=8192)
     cuda_graph_config = CudaGraphConfig(
         batch_sizes=[1]) if use_cuda_graph else None
 
@@ -190,7 +190,7 @@ def test_deepseek_eagle3():
     max_batch_size = 16
     max_draft_len = 3
     kv_cache_config = KvCacheConfig(enable_block_reuse=enable_block_reuse,
-                                    free_gpu_memory_fraction=0.5)
+                                    max_tokens=8192)
     cuda_graph_config = CudaGraphConfig(
         batch_sizes=[1]) if use_cuda_graph else None
 
3 changes: 1 addition & 2 deletions tests/unittest/_torch/speculative/test_kv_cache_reuse.py
@@ -32,8 +32,7 @@ def test_kv_cache_reuse(use_cuda_graph: bool, attn_backend: str):
     # that ref and spec does not match 100%
     max_batch_size = 1
     max_draft_len = 4
-    kv_cache_config = KvCacheConfig(enable_block_reuse=True,
-                                    free_gpu_memory_fraction=0.5)
+    kv_cache_config = KvCacheConfig(enable_block_reuse=True, max_tokens=8192)
     cuda_graph_config = CudaGraphConfig(
         batch_sizes=[1]) if use_cuda_graph else None
 
2 changes: 1 addition & 1 deletion tests/unittest/_torch/speculative/test_ngram.py
@@ -27,7 +27,7 @@ def test_llama_ngram(disable_overlap_scheduler: bool, use_cuda_graph: bool,
 
     max_batch_size = 2
     max_draft_len = 4
-    kv_cache_config = KvCacheConfig(enable_block_reuse=False)
+    kv_cache_config = KvCacheConfig(enable_block_reuse=False, max_tokens=8192)
     cuda_graph_config = CudaGraphConfig(
         batch_sizes=[1]) if use_cuda_graph else None
 
2 changes: 1 addition & 1 deletion tests/unittest/_torch/speculative/test_user_provided.py
@@ -28,7 +28,7 @@ def test_llama_user_provided(disable_overlap_scheduler: bool,
 
     max_batch_size = 2
     max_draft_len = 4
-    kv_cache_config = KvCacheConfig(enable_block_reuse=False)
+    kv_cache_config = KvCacheConfig(enable_block_reuse=False, max_tokens=8192)
     cuda_graph_config = CudaGraphConfig(
         batch_sizes=[1]) if use_cuda_graph else None
 
30 changes: 29 additions & 1 deletion tests/unittest/conftest.py
@@ -13,7 +13,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # # Force resource release after test
-
+import os
+import sys
 import traceback
 from typing import Any
 
@@ -100,6 +101,33 @@ def pytest_sessionstart(session):
     import torch._inductor.async_compile  # noqa: F401
 
 
+@pytest.fixture(autouse=True)
+def cuda_error_early_quit(capfd):
+    """
+    Fixture to handle CUDA errors.
+
+    CUDA errors are usually persistent and require a process restart to recover.
+    Immediately stop the current worker when a CUDA error occurs; it will then
+    be restarted by the master process.
+    """
+    if torch.cuda.is_available() and os.environ.get("PYTEST_XDIST_WORKER",
+                                                    None):
+        try:
+            yield
+            torch.cuda.synchronize()
+        except RuntimeError as e:
+            msg = str(e)
+            if 'CUDA error:' in msg:
+                with capfd.disabled():
+                    traceback.print_exception(e, file=sys.stderr)
+                    print("CUDA Error occurred, worker must quit now",
+                          file=sys.stderr)
+                    os._exit(1)
+            raise
+    else:
+        yield
+
+
 @pytest.fixture(autouse=True)
 def torch_empty_cache() -> None:
     """
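The new cuda_error_early_quit fixture only arms itself when PYTEST_XDIST_WORKER is set, i.e. when the suite runs under pytest-xdist, whose master process manages the workers. A hypothetical illustration (not part of the PR) of invoking the tests that way; the test path and worker count are assumptions:

import pytest

if __name__ == "__main__":
    # "-n 4" (pytest-xdist) spawns 4 workers and sets PYTEST_XDIST_WORKER in each,
    # so a worker hitting a persistent CUDA error exits via os._exit(1); per the
    # fixture's docstring, the master process then restarts it and the remaining
    # tests continue.
    raise SystemExit(pytest.main(["-n", "4", "tests/unittest"]))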