From cf5b0481cbf1b8c12dbd417e5ac758c48a38ab76 Mon Sep 17 00:00:00 2001 From: ngc92 <7938269+ngc92@users.noreply.github.com> Date: Sat, 31 May 2025 23:03:37 +0200 Subject: [PATCH] initial work towards having a standalone runner --- examples/identity_py/submission.py | 2 +- src/discord-cluster-manager/bot.py | 3 +- src/discord-cluster-manager/cogs/admin_cog.py | 3 +- src/discord-cluster-manager/consts.py | 7 ++- .../launchers/__init__.py | 2 +- .../launchers/generic.py | 63 +++++++++++++++++++ .../standalone-init.sh | 6 ++ .../standalone-runner.py | 53 ++++++++++++++++ 8 files changed, 134 insertions(+), 5 deletions(-) create mode 100644 src/discord-cluster-manager/launchers/generic.py create mode 100644 src/discord-cluster-manager/standalone-init.sh create mode 100644 src/discord-cluster-manager/standalone-runner.py diff --git a/examples/identity_py/submission.py b/examples/identity_py/submission.py index 0ef8b529..1a7a7e24 100644 --- a/examples/identity_py/submission.py +++ b/examples/identity_py/submission.py @@ -1,4 +1,4 @@ -#!POPCORN leaderboard identity_py +#!POPCORN leaderboard identity_py-dev from task import input_t, output_t diff --git a/src/discord-cluster-manager/bot.py b/src/discord-cluster-manager/bot.py index dfb64b75..5680b8cf 100644 --- a/src/discord-cluster-manager/bot.py +++ b/src/discord-cluster-manager/bot.py @@ -26,7 +26,7 @@ POSTGRES_USER, init_environment, ) -from launchers import GitHubLauncher, ModalLauncher +from launchers import GitHubLauncher, ModalLauncher, GenericLauncher from leaderboard_db import LeaderboardDB from utils import setup_logging @@ -80,6 +80,7 @@ async def setup_hook(self): submit_cog = SubmitCog(self) submit_cog.register_launcher(ModalLauncher(consts.MODAL_CUDA_INCLUDE_DIRS)) submit_cog.register_launcher(GitHubLauncher(env.GITHUB_REPO, env.GITHUB_TOKEN)) + submit_cog.register_launcher(GenericLauncher("http://65.108.32.167:8000/run", token='TOKEN')) await self.add_cog(submit_cog) await self.add_cog(BotManagerCog(self)) await self.add_cog(LeaderboardCog(self)) diff --git a/src/discord-cluster-manager/cogs/admin_cog.py b/src/discord-cluster-manager/cogs/admin_cog.py index 8e39ee2f..ed6b8fd3 100644 --- a/src/discord-cluster-manager/cogs/admin_cog.py +++ b/src/discord-cluster-manager/cogs/admin_cog.py @@ -10,7 +10,7 @@ import discord import env import yaml -from consts import GitHubGPU, ModalGPU +from consts import GitHubGPU, ModalGPU, OtherGPU from discord import app_commands from discord.ext import commands, tasks from leaderboard_db import leaderboard_name_autocomplete @@ -153,6 +153,7 @@ async def is_creator_check( @app_commands.choices( gpu=[app_commands.Choice(name=gpu.name, value=gpu.value) for gpu in GitHubGPU] + [app_commands.Choice(name=gpu.name, value=gpu.value) for gpu in ModalGPU] + + [app_commands.Choice(name=gpu.name, value=gpu.value) for gpu in OtherGPU] ) @with_error_handling async def leaderboard_create_local( diff --git a/src/discord-cluster-manager/consts.py b/src/discord-cluster-manager/consts.py index 1a9ac6e2..b9992233 100644 --- a/src/discord-cluster-manager/consts.py +++ b/src/discord-cluster-manager/consts.py @@ -31,6 +31,10 @@ class ModalGPU(Enum): B200 = "B200" +class OtherGPU(Enum): + A6000 = "A6000" + + @dataclasses.dataclass class GPU: name: str @@ -48,7 +52,7 @@ def _make_gpu_lookup(runner_map: dict[str, Type[Enum]]): return lookup -_GPU_LOOKUP = _make_gpu_lookup({"Modal": ModalGPU, "GitHub": GitHubGPU}) +_GPU_LOOKUP = _make_gpu_lookup({"Modal": ModalGPU, "GitHub": GitHubGPU, "Generic": OtherGPU}) def get_gpu_by_name(name: str) -> GPU: @@ -114,6 +118,7 @@ class RankCriterion(Enum): "T4": "75", "L4": "80", "A100": "80", + "A6000": "86", "H100": "90a", "B200": "100", "NVIDIA": None, diff --git a/src/discord-cluster-manager/launchers/__init__.py b/src/discord-cluster-manager/launchers/__init__.py index df47476f..8bf4ab8f 100644 --- a/src/discord-cluster-manager/launchers/__init__.py +++ b/src/discord-cluster-manager/launchers/__init__.py @@ -1,5 +1,5 @@ from .github import GitHubLauncher from .launcher import Launcher from .modal import ModalLauncher - +from .generic import GenericLauncher __all__ = [Launcher, GitHubLauncher, ModalLauncher] diff --git a/src/discord-cluster-manager/launchers/generic.py b/src/discord-cluster-manager/launchers/generic.py new file mode 100644 index 00000000..ce0805ca --- /dev/null +++ b/src/discord-cluster-manager/launchers/generic.py @@ -0,0 +1,63 @@ +# Generic launcher POSTs to a specific URL +import asyncio +import datetime +import json + +import requests + +from consts import GPU, OtherGPU +from report import RunProgressReporter +from run_eval import FullResult, CompileResult, RunResult, EvalResult, SystemInfo +from utils import setup_logging, KernelBotError + +from .launcher import Launcher + +logger = setup_logging(__name__) + + +class GenericLauncher(Launcher): + def __init__(self, url: str, token: str): + super().__init__("Generic", gpus=OtherGPU) + self.url = url + self.token = token + + async def run_submission( + self, config: dict, gpu_type: GPU, status: RunProgressReporter + ) -> FullResult: + loop = asyncio.get_event_loop() + logger.info(f"Calling {self.url}") + + await status.push("⏳ Waiting for run to finish...") + result = await loop.run_in_executor( + None, + lambda: requests.post(self.url, json={"config": config, "token": self.token}) + ) + + print(result.text) + + await status.update("✅ Waiting for run to finish... Done") + if result.status_code != 200: + logger.error("Error running submission. Status code %d, Message: %s", result.status_code, result.text) + raise KernelBotError(f"Error running submission. Status code {result.status_code}") + + # TODO: this code is duplicated :( + data = result.json() + runs = {} + # convert json back to EvalResult structures, which requires + # special handling for datetime and our dataclasses. + for k, v in data["runs"].items(): + if "compilation" in v and v["compilation"] is not None: + comp = CompileResult(**v["compilation"]) + else: + comp = None + run = RunResult(**v["run"]) + res = EvalResult( + start=datetime.datetime.fromisoformat(v["start"]), + end=datetime.datetime.fromisoformat(v["end"]), + compilation=comp, + run=run, + ) + runs[k] = res + + system = SystemInfo(**data.get("system", {})) + return FullResult(success=True, error="", runs=runs, system=system) diff --git a/src/discord-cluster-manager/standalone-init.sh b/src/discord-cluster-manager/standalone-init.sh new file mode 100644 index 00000000..e3b393be --- /dev/null +++ b/src/discord-cluster-manager/standalone-init.sh @@ -0,0 +1,6 @@ +apt install python3-pip +pip install uv --break-system-packages +uv venv +source .venv/bin/activate +uv pip install -r requirements.txt +uv pip install torch numpy diff --git a/src/discord-cluster-manager/standalone-runner.py b/src/discord-cluster-manager/standalone-runner.py new file mode 100644 index 00000000..eff1fb43 --- /dev/null +++ b/src/discord-cluster-manager/standalone-runner.py @@ -0,0 +1,53 @@ +import asyncio +import os +from dataclasses import asdict +from pydantic import BaseModel + +import uvicorn + +from run_eval import run_config + +from fastapi import FastAPI, HTTPException + +app = FastAPI() + + +_serial_run = asyncio.Semaphore(1) +_runner_token = None + + +class RunRequest(BaseModel): + config: dict + token: str + + +@app.post("/run") +async def run(request: RunRequest) -> dict: + # only one submission can run at any given time + if request.token != _runner_token: + raise HTTPException(status_code=401, detail="Invalid token") + async with _serial_run: + return asdict(run_config(request.config)) + + +async def run_server(port): + config = uvicorn.Config( + app, + host="0.0.0.0", + port=port, + log_level="info", + limit_concurrency=2, + ) + server = uvicorn.Server(config) + + # we need this as discord and fastapi both run on the same event loop + await server.serve() + + +def main(): + with asyncio.Runner() as runner: + runner.run(run_server(port=int(os.environ.get("PORT") or 8000))) + + +if __name__ == "__main__": + main()