Skip to content
Closed
16 changes: 2 additions & 14 deletions .github/workflows/nvidia-arc-health.yml
Original file line number Diff line number Diff line change
Expand Up @@ -6,27 +6,15 @@ on:
- cron: '0 2 * * *'
workflow_dispatch:
push:
branches: [main]

jobs:
health-check:
runs-on: [gpumode-nvidia-arc]
runs-on: [nvidia-docker-b200-8-x86-64]
timeout-minutes: 5
container:
image: nvidia/cuda:12.4.0-devel-ubuntu22.04

steps:
- name: Setup Python
uses: actions/setup-python@v5
with:
python-version: '3.10'

- name: Install PyTorch
run: |
pip install torch

- name: GPU Health Check
run: python -c "import torch; torch.randn(5, device='cuda')"
run: python3 -c "import torch; torch.randn(5, device='cuda')"

env:
CUDA_VISIBLE_DEVICES: 0
46 changes: 15 additions & 31 deletions .github/workflows/nvidia_workflow.yml
Original file line number Diff line number Diff line change
Expand Up @@ -19,22 +19,20 @@ run-name: 'NVIDIA Job - ${{ github.event.inputs.run_id }}'

jobs:
run:
runs-on: [gpumode-nvidia-arc]
runs-on: [nvidia-docker-b200-8-x86-64]
timeout-minutes: 10
container:
image: nvidia/cuda:12.4.0-devel-ubuntu22.04
steps:
- uses: actions/checkout@v3

- name: Setup Python
uses: actions/setup-python@v5
with:
python-version: '3.10'

- name: Install uv
uses: astral-sh/setup-uv@v3
with:
version: "latest"
- name: nvidia-smi
shell: bash
run: |
nvidia-smi || echo "nvidia-smi failed"
- name: ncu
shell: bash
run: |
ncu --version || echo "ncu failed"

- name: Create input files
shell: bash
Expand All @@ -49,30 +47,18 @@ jobs:
# Now write to file (won't be logged since it's masked)
echo "$PAYLOAD" > payload.json

- name: Install uv
uses: astral-sh/setup-uv@v3
with:
version: "latest"

- name: Setup Python environment
- name: Setup Virtual Environment and Install Dependencies
shell: bash
run: |
uv venv .venv
echo "VIRTUAL_ENV=$PWD/.venv" >> $GITHUB_ENV
echo "$PWD/.venv/bin" >> $GITHUB_PATH
pip install --upgrade pip
pip install -r "requirements.txt"
pip install -e .

if [[ -n "${{ github.event.inputs.requirements }}" ]]; then
cat > "requirements.txt" <<'EOL'
${{ github.event.inputs.requirements }}
EOL
uv pip install -r "requirements.txt"
fi
uv pip install -e .

- name: Run script
shell: bash
run: |
python src/runners/github-runner.py
python3 src/runners/github-runner.py

- name: Upload training artifacts
uses: actions/upload-artifact@v4
Expand All @@ -88,5 +74,3 @@ jobs:
name: profile-data
path: profile_data/*
retention-days: 1
env:
CUDA_VISIBLE_DEVICES: 0
33 changes: 21 additions & 12 deletions src/libkernelbot/run_eval.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ class ProfileResult:
# Public download URL of all files created by the profiler
# This may also be configured later
download_url: Optional[str]
#fmt: on
# fmt: on


@dataclasses.dataclass
Expand Down Expand Up @@ -351,9 +351,15 @@ def profile_program(
"--",
] + call

run_result = run_program(call, seed=seed, timeout=timeout, multi_gpu=multi_gpu, extra_env={
"GPU_DUMP_CODE_OBJECT": "1",
})
run_result = run_program(
call,
seed=seed,
timeout=timeout,
multi_gpu=multi_gpu,
extra_env={
"GPU_DUMP_CODE_OBJECT": "1",
},
)

profile_result = None

Expand All @@ -377,7 +383,7 @@ def profile_program(
code_obj.rename(output_dir / code_obj.name)

profile_result = ProfileResult(
profiler='rocPROF',
profiler="rocPROF",
download_url=None,
)

Expand All @@ -386,6 +392,7 @@ def profile_program(
# TODO: Implement profiling for other platforms
return run_program(call, seed=seed, timeout=timeout, multi_gpu=multi_gpu), None


def run_single_evaluation(
system: SystemInfo,
call: list[str],
Expand Down Expand Up @@ -427,7 +434,7 @@ def run_single_evaluation(
return run_program(call, seed=seed, timeout=timeout, multi_gpu=multi_gpu), None


def make_system_info() -> SystemInfo: # noqa: C901
def make_system_info() -> SystemInfo: # noqa: C901
info = SystemInfo()
try:
import torch
Expand All @@ -448,14 +455,16 @@ def make_system_info() -> SystemInfo: # noqa: C901
info.gpu = subprocess.check_output(
["nvidia-smi", "--query-gpu=name", "--format=csv,noheader"], encoding="utf-8"
)
info.device_count = info.gpu.count('\n')
info.device_count = info.gpu.count("\n")
info.runtime = "CUDA"
except subprocess.CalledProcessError:
# try again for HIP
try:
rocm_info = json.loads(subprocess.check_output(
["rocm-smi", "--showproductname", "--json"], encoding="utf-8"
))
rocm_info = json.loads(
subprocess.check_output(
["rocm-smi", "--showproductname", "--json"], encoding="utf-8"
)
)
if len(rocm_info) > 0:
info.gpu = next(rocm_info.__iter__())["Card Series"]

Expand Down Expand Up @@ -587,7 +596,7 @@ def run_pytorch_script( # noqa: C901
# "compile" step: execute the script once. Will populate
# `load_inline`'s compile cache, so the actual runs will be faster.
try:
compile_run = run_program(["python", "submission.py"], seed=1, timeout=Timeout.COMPILE)
compile_run = run_program(["python3", "submission.py"], seed=1, timeout=Timeout.COMPILE)
if "-DTORCH_EXTENSION_NAME" in compile_run.stdout:
comp = CompileResult(
nvcc_found=True,
Expand All @@ -613,7 +622,7 @@ def run_pytorch_script( # noqa: C901
exit_code=e.returncode,
)

run, profile = run_single_evaluation(system, ["python", main], **kwargs)
run, profile = run_single_evaluation(system, ["python3", main], **kwargs)

return EvalResult(
start=start,
Expand Down
Loading