diff --git a/src/discord-cluster-manager/consts.py b/src/discord-cluster-manager/consts.py index 928f59d4..bf0cec35 100644 --- a/src/discord-cluster-manager/consts.py +++ b/src/discord-cluster-manager/consts.py @@ -29,6 +29,7 @@ class ModalGPU(Enum): A100 = "A100" H100 = "H100" B200 = "B200" + H200 = "H200" @dataclasses.dataclass @@ -115,6 +116,7 @@ class RankCriterion(Enum): "L4": "80", "A100": "80", "H100": "90a", + "H200": "90a", "B200": "100", "NVIDIA": None, "MI300": None, diff --git a/src/discord-cluster-manager/modal_runner.py b/src/discord-cluster-manager/modal_runner.py index 5766e8a2..d4f70feb 100644 --- a/src/discord-cluster-manager/modal_runner.py +++ b/src/discord-cluster-manager/modal_runner.py @@ -19,6 +19,8 @@ tag = f"{cuda_version}-{flavor}-{operating_sys}" # Move this to another file later: + + cuda_image = ( Image.from_registry(f"nvidia/cuda:{tag}", add_python="3.11") .apt_install( @@ -50,6 +52,7 @@ .pip_install("requests") ) + cuda_image = cuda_image.add_local_python_source( "consts", "modal_runner", @@ -57,6 +60,12 @@ "run_eval", ) +cuda_image_b200 = ( + Image.from_registry("nvidia/cuda:12.8.0-devel-ubuntu24.04", add_python="3.11") + .pip_install("ninja", "packaging", "requests") + .pip_install("torch==2.7.0", extra_index_url="https://download.pytorch.org/whl/cu128") +) + class TimeoutException(Exception): pass diff --git a/src/discord-cluster-manager/modal_runner_archs.py b/src/discord-cluster-manager/modal_runner_archs.py index 75cd45bf..725e63e6 100644 --- a/src/discord-cluster-manager/modal_runner_archs.py +++ b/src/discord-cluster-manager/modal_runner_archs.py @@ -1,42 +1,14 @@ # This file contains wrapper functions for running # Modal apps on specific devices. We will fix this later. -from modal_runner import app, cuda_image, modal_run_config -from modal_utils import deserialize_full_result -from run_eval import FullResult, SystemInfo +from modal_runner import app, cuda_image, cuda_image_b200, modal_run_config -gpus = ["T4", "L4", "A100-80GB", "H100!"] +gpus = ["T4", "L4", "A100-80GB", "H100!", "B200", "H200"] for gpu in gpus: gpu_slug = gpu.lower().split("-")[0].strip("!") app.function(gpu=gpu, image=cuda_image, name=f"run_cuda_script_{gpu_slug}", serialized=True)( modal_run_config ) - app.function(gpu=gpu, image=cuda_image, name=f"run_pytorch_script_{gpu_slug}", serialized=True)( + img = cuda_image if gpu != "B200" else cuda_image_b200 + app.function(gpu=gpu, image=img, name=f"run_pytorch_script_{gpu_slug}", serialized=True)( modal_run_config ) - - -@app.function(image=cuda_image, max_containers=1, timeout=600) -def run_pytorch_script_b200(config: dict, timeout: int = 300): - """Send a config and timeout to the server and return the response.""" - import requests - - ip_addr = "34.59.196.5" - port = "33001" - - payload = {"config": config, "timeout": timeout} - - try: - response = requests.post(f"http://{ip_addr}:{port}", json=payload, timeout=timeout + 5) - response.raise_for_status() - print("ORIGINAL", response.json()) - - print("DESERIALIZED", deserialize_full_result(response.json())) - return deserialize_full_result(response.json()) - except requests.RequestException as e: - return FullResult(success=False, error=str(e), runs={}, system=SystemInfo()) - - -@app.local_entrypoint() -def test_b200(timeout: int = 300): - config = {} - run_pytorch_script_b200.remote(config, timeout)