diff --git a/.github/workflows/nvidia-arc-health.yml b/.github/workflows/nvidia-arc-health.yml
index 13841d27..a619f8e1 100644
--- a/.github/workflows/nvidia-arc-health.yml
+++ b/.github/workflows/nvidia-arc-health.yml
@@ -6,27 +6,15 @@ on:
     - cron: '0 2 * * *'
   workflow_dispatch:
   push:
-    branches: [main]
 
 jobs:
   health-check:
-    runs-on: [gpumode-nvidia-arc]
+    runs-on: [nvidia-docker-b200-8-x86-64]
     timeout-minutes: 5
-    container:
-      image: nvidia/cuda:12.4.0-devel-ubuntu22.04
     steps:
-      - name: Setup Python
-        uses: actions/setup-python@v5
-        with:
-          python-version: '3.10'
-
-      - name: Install PyTorch
-        run: |
-          pip install torch
-
       - name: GPU Health Check
-        run: python -c "import torch; torch.randn(5, device='cuda')"
+        run: python3 -c "import torch; torch.randn(5, device='cuda')"
         env:
           CUDA_VISIBLE_DEVICES: 0
diff --git a/.github/workflows/nvidia_workflow.yml b/.github/workflows/nvidia_workflow.yml
index b50ec044..dee90785 100644
--- a/.github/workflows/nvidia_workflow.yml
+++ b/.github/workflows/nvidia_workflow.yml
@@ -19,22 +19,20 @@ run-name: 'NVIDIA Job - ${{ github.event.inputs.run_id }}'
 
 jobs:
   run:
-    runs-on: [gpumode-nvidia-arc]
+    runs-on: [nvidia-docker-b200-8-x86-64]
     timeout-minutes: 10
-    container:
-      image: nvidia/cuda:12.4.0-devel-ubuntu22.04
     steps:
       - uses: actions/checkout@v3
-      - name: Setup Python
-        uses: actions/setup-python@v5
-        with:
-          python-version: '3.10'
-
-      - name: Install uv
-        uses: astral-sh/setup-uv@v3
-        with:
-          version: "latest"
+      - name: nvidia-smi
+        shell: bash
+        run: |
+          nvidia-smi || echo "nvidia-smi failed"
+
+      - name: ncu
+        shell: bash
+        run: |
+          ncu --version || echo "ncu failed"
 
       - name: Create input files
         shell: bash
         run: |
@@ -49,30 +47,18 @@ jobs:
           # Now write to file (won't be logged since it's masked)
           echo "$PAYLOAD" > payload.json
 
-      - name: Install uv
-        uses: astral-sh/setup-uv@v3
-        with:
-          version: "latest"
-
-      - name: Setup Python environment
+      - name: Setup Virtual Environment and Install Dependencies
         shell: bash
         run: |
-          uv venv .venv
-          echo "VIRTUAL_ENV=$PWD/.venv" >> $GITHUB_ENV
-          echo "$PWD/.venv/bin" >> $GITHUB_PATH
+          pip install --upgrade pip
+          pip install -r "requirements.txt"
+          pip install -e .
 
-          if [[ -n "${{ github.event.inputs.requirements }}" ]]; then
-            cat > "requirements.txt" <<'EOL'
-          ${{ github.event.inputs.requirements }}
-          EOL
-            uv pip install -r "requirements.txt"
-          fi
-          uv pip install -e .
 
       - name: Run script
         shell: bash
         run: |
-          python src/runners/github-runner.py
+          python3 src/runners/github-runner.py
 
       - name: Upload training artifacts
         uses: actions/upload-artifact@v4
@@ -88,5 +74,3 @@ jobs:
           name: profile-data
           path: profile_data/*
           retention-days: 1
-      env:
-        CUDA_VISIBLE_DEVICES: 0
diff --git a/src/libkernelbot/run_eval.py b/src/libkernelbot/run_eval.py
index c0897baf..de448784 100644
--- a/src/libkernelbot/run_eval.py
+++ b/src/libkernelbot/run_eval.py
@@ -22,7 +22,7 @@ class ProfileResult:
     # Public download URL of all files created by the profiler
     # This may also be configured later
     download_url: Optional[str]
-    #fmt: on
+    # fmt: on
 
 
 @dataclasses.dataclass
@@ -351,9 +351,15 @@ def profile_program(
         "--",
     ] + call
 
-    run_result = run_program(call, seed=seed, timeout=timeout, multi_gpu=multi_gpu, extra_env={
-        "GPU_DUMP_CODE_OBJECT": "1",
-    })
+    run_result = run_program(
+        call,
+        seed=seed,
+        timeout=timeout,
+        multi_gpu=multi_gpu,
+        extra_env={
+            "GPU_DUMP_CODE_OBJECT": "1",
+        },
+    )
 
     profile_result = None
 
@@ -377,7 +383,7 @@ def profile_program(
             code_obj.rename(output_dir / code_obj.name)
 
         profile_result = ProfileResult(
-            profiler='rocPROF',
+            profiler="rocPROF",
             download_url=None,
         )
 
@@ -386,6 +392,7 @@ def profile_program(
         # TODO: Implement profiling for other platforms
         return run_program(call, seed=seed, timeout=timeout, multi_gpu=multi_gpu), None
 
+
 def run_single_evaluation(
     system: SystemInfo,
     call: list[str],
@@ -427,7 +434,7 @@ def run_single_evaluation(
     return run_program(call, seed=seed, timeout=timeout, multi_gpu=multi_gpu), None
 
 
-def make_system_info() -> SystemInfo: # noqa: C901
+def make_system_info() -> SystemInfo:  # noqa: C901
     info = SystemInfo()
     try:
         import torch
@@ -448,14 +455,16 @@ def make_system_info() -> SystemInfo:  # noqa: C901
         info.gpu = subprocess.check_output(
             ["nvidia-smi", "--query-gpu=name", "--format=csv,noheader"], encoding="utf-8"
         )
-        info.device_count = info.gpu.count('\n')
+        info.device_count = info.gpu.count("\n")
         info.runtime = "CUDA"
     except subprocess.CalledProcessError:
         # try again for HIP
         try:
-            rocm_info = json.loads(subprocess.check_output(
-                ["rocm-smi", "--showproductname", "--json"], encoding="utf-8"
-            ))
+            rocm_info = json.loads(
+                subprocess.check_output(
+                    ["rocm-smi", "--showproductname", "--json"], encoding="utf-8"
+                )
+            )
 
             if len(rocm_info) > 0:
                 info.gpu = next(rocm_info.__iter__())["Card Series"]
@@ -587,7 +596,7 @@ def run_pytorch_script(  # noqa: C901
         # "compile" step: execute the script once. Will populate
         # `load_inline`'s compile cache, so the actual runs will be faster.
        try:
-            compile_run = run_program(["python", "submission.py"], seed=1, timeout=Timeout.COMPILE)
+            compile_run = run_program(["python3", "submission.py"], seed=1, timeout=Timeout.COMPILE)
             if "-DTORCH_EXTENSION_NAME" in compile_run.stdout:
                 comp = CompileResult(
                     nvcc_found=True,
@@ -613,7 +622,7 @@ def run_pytorch_script(  # noqa: C901
                     exit_code=e.returncode,
                 )
 
-    run, profile = run_single_evaluation(system, ["python", main], **kwargs)
+    run, profile = run_single_evaluation(system, ["python3", main], **kwargs)
 
     return EvalResult(
         start=start,
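Review note: the new health-check one-liner assumes PyTorch with CUDA support is already baked into the `nvidia-docker-b200-8-x86-64` runner image, since the diff drops both the `nvidia/cuda` container and the `pip install torch` step. A slightly more defensive standalone version of that smoke test might look like the sketch below; `expected_devices` is an illustrative parameter, not something the workflow defines.

```python
# Minimal sketch of a defensive GPU health check, assuming torch is
# preinstalled on the runner image. `expected_devices` is illustrative only.
import sys

import torch


def gpu_health_check(expected_devices: int = 1) -> int:
    if not torch.cuda.is_available():
        print("CUDA is not available", file=sys.stderr)
        return 1
    found = torch.cuda.device_count()
    if found < expected_devices:
        print(f"expected {expected_devices} GPU(s), found {found}", file=sys.stderr)
        return 1
    # Same smoke test as the workflow step: allocate a small tensor on the GPU,
    # which forces CUDA context creation and fails fast on a broken driver.
    torch.randn(5, device="cuda")
    return 0


if __name__ == "__main__":
    sys.exit(gpu_health_check())
```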
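The reflowed `make_system_info` hunk keeps the detection order unchanged: query `nvidia-smi` first and fall back to `rocm-smi` on failure. Below is a self-contained sketch of that pattern, reusing only the CLI invocations visible in the diff; the error handling and return shape are illustrative, not the exact logic of `run_eval.py`.

```python
# Sketch of the CUDA-first, ROCm-fallback GPU detection from make_system_info().
# The subprocess calls mirror the diff; the surrounding structure is illustrative.
import json
import subprocess


def detect_gpu() -> tuple[str, str, int]:
    """Return (runtime, gpu_name, device_count)."""
    try:
        out = subprocess.check_output(
            ["nvidia-smi", "--query-gpu=name", "--format=csv,noheader"], encoding="utf-8"
        )
        # nvidia-smi prints one name per line, so the newline count is the device count.
        return "CUDA", out.strip(), out.count("\n")
    except (subprocess.CalledProcessError, FileNotFoundError):
        rocm_info = json.loads(
            subprocess.check_output(
                ["rocm-smi", "--showproductname", "--json"], encoding="utf-8"
            )
        )
        # rocm-smi --json keys devices as "card0", "card1", ...; take the first entry.
        first = next(iter(rocm_info.values()))
        return "HIP", first["Card Series"], len(rocm_info)
```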