
Commit c1973b6

ngc92, msaroufim, S1ro1, and alexzhang13 authored
Nvidia-competition setup (profiling + B200 runners) (#368)
* Change runner from gpumode-nvidia-arc to Nvidia-A100
* Update nvidia-arc-health.yml
* Update nvidia-arc-health.yml
* Feat: run health on b200
* tmp
* tmp
* tmp
* feat
* feat
* feat
* replace nvidia workflow to point to our b200 cluster
* Fix: container
* Fix: python->python3
* Fix: add back deps
* Fix: python->python3
* Fix: python->python3
* Add nvidia-smi
* split profiling into rocm/ncu; small code improvements
* profile each benchmark individually for cleaner traces
* profile in tempdir
* send profile results as attached files
* don't spam alerts
* include default ncu report
* attempt at filtered ncu
* formatting fix
* fix tests
* Fix: good error for profile via api
* Fix: remove nvidia-smi from workflow
* Fix: polling time to 15s
* limit profiling report length
* limit number of kernels to be profiled
* stricter matching for kernel name lines
* add an additional safety limit to ncu reports
* fix
* Fix: style

---------

Co-authored-by: Mark Saroufim <[email protected]>
Co-authored-by: S1ro1 <[email protected]>
Co-authored-by: Alex Zhang <[email protected]>
1 parent bc63779 commit c1973b6

File tree

13 files changed: +373 −192 lines


.github/workflows/nvidia-arc-health.yml

Lines changed: 2 additions & 14 deletions
@@ -6,27 +6,15 @@ on:
     - cron: '0 2 * * *'
   workflow_dispatch:
   push:
-    branches: [main]

 jobs:
   health-check:
-    runs-on: [gpumode-nvidia-arc]
+    runs-on: [nvidia-docker-b200-8-x86-64]
     timeout-minutes: 5
-    container:
-      image: nvidia/cuda:12.4.0-devel-ubuntu22.04

     steps:
-      - name: Setup Python
-        uses: actions/setup-python@v5
-        with:
-          python-version: '3.10'
-
-      - name: Install PyTorch
-        run: |
-          pip install torch
-
       - name: GPU Health Check
-        run: python -c "import torch; torch.randn(5, device='cuda')"
+        run: python3 -c "import torch; torch.randn(5, device='cuda')"

         env:
           CUDA_VISIBLE_DEVICES: 0

.github/workflows/nvidia_workflow.yml

Lines changed: 6 additions & 32 deletions
@@ -19,23 +19,11 @@ run-name: 'NVIDIA Job - ${{ github.event.inputs.run_id }}'

 jobs:
   run:
-    runs-on: [gpumode-nvidia-arc]
+    runs-on: [nvidia-docker-b200-8-x86-64]
     timeout-minutes: 10
-    container:
-      image: nvidia/cuda:12.4.0-devel-ubuntu22.04
     steps:
       - uses: actions/checkout@v3

-      - name: Setup Python
-        uses: actions/setup-python@v5
-        with:
-          python-version: '3.10'
-
-      - name: Install uv
-        uses: astral-sh/setup-uv@v3
-        with:
-          version: "latest"
-
       - name: Create input files
         shell: bash
         run: |
@@ -49,30 +37,18 @@ jobs:
           # Now write to file (won't be logged since it's masked)
           echo "$PAYLOAD" > payload.json

-      - name: Install uv
-        uses: astral-sh/setup-uv@v3
-        with:
-          version: "latest"
-
-      - name: Setup Python environment
+      - name: Setup Virtual Environment and Install Dependencies
         shell: bash
         run: |
-          uv venv .venv
-          echo "VIRTUAL_ENV=$PWD/.venv" >> $GITHUB_ENV
-          echo "$PWD/.venv/bin" >> $GITHUB_PATH
+          pip install --upgrade pip
+          pip install -r "requirements.txt"
+          pip install -e .

-          if [[ -n "${{ github.event.inputs.requirements }}" ]]; then
-            cat > "requirements.txt" <<'EOL'
-          ${{ github.event.inputs.requirements }}
-          EOL
-            uv pip install -r "requirements.txt"
-          fi
-          uv pip install -e .

       - name: Run script
         shell: bash
         run: |
-          python src/runners/github-runner.py
+          python3 src/runners/github-runner.py

       - name: Upload training artifacts
         uses: actions/upload-artifact@v4
@@ -88,5 +64,3 @@ jobs:
           name: profile-data
           path: profile_data/*
           retention-days: 1
-    env:
-      CUDA_VISIBLE_DEVICES: 0
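Dropping the container: block and the setup-python/setup-uv steps assumes the nvidia-docker-b200-8-x86-64 runner image already ships a working python3, pip, and CUDA stack. A quick sanity check one might run on such a runner to confirm that assumption (a sketch, not part of the workflow):

```python
# Hypothetical sanity check for the runner image the reworked workflow relies on:
# a system python3 with pip available, and a CUDA device visible to torch.
import shutil
import subprocess
import sys

assert shutil.which("python3") is not None, "runner image must provide python3"
subprocess.run([sys.executable, "-m", "pip", "--version"], check=True)

import torch  # installed by the workflow via requirements.txt

assert torch.cuda.is_available(), "no CUDA device visible"
print(torch.cuda.get_device_name(0))
```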

examples/eval.py

Lines changed: 29 additions & 4 deletions
@@ -500,9 +500,9 @@ def run_benchmarking(logger: PopcornOutput, pool: multiprocessing.Pool, tests: l
         return 112


-def _run_single_profile(test: TestCase) -> str:
+def _run_single_profile_torch(test: TestCase) -> str:
     """
-    Runs a single test case. Do not call directly
+    Profiles a single benchmark using the torch profiler.
     """
     from submission import custom_kernel
     from torch.profiler import profile, ProfilerActivity
@@ -511,14 +511,36 @@ def _run_single_profile(test: TestCase) -> str:
     data = generate_input(**test.args)
     torch.cuda.synchronize()

+    cloned = _clone_data(data, 0)
     with profile(activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA]) as prof:
         with nvtx_range("custom_kernel"):
-            submission_output = custom_kernel(_clone_data(data, 0))
+            submission_output = custom_kernel(cloned)
             torch.cuda.synchronize()

     return prof.key_averages().table(sort_by="self_cuda_time_total", row_limit=20)


+def _run_single_profile_ncu(test: TestCase) -> str:
+    """
+    Profiles a single benchmark using ncu. Note: this does not
+    invoke NCU; instead, it is expected that eval is launched
+    under NCU, and this function will run the kernel exactly
+    once in the 'custom_kernel' nvtx range.
+    """
+    from submission import custom_kernel
+
+    with nvtx_range("generate input"):
+        data = generate_input(**test.args)
+        torch.cuda.synchronize()
+
+    cloned = _clone_data(data, 0)
+    with nvtx_range("custom_kernel"):
+        submission_output = custom_kernel(cloned)
+        torch.cuda.synchronize()
+
+    return ""
+
+
 def _run_distributed_profile(test: TestCase, rank: int) -> "EventList":
     """
     Runs a single profiling case. Do not call directly
@@ -610,7 +632,10 @@ def run_single_profile(test: TestCase, pool: multiprocessing.Pool) -> str:
     """
     world_size = test.args.get("world_size", None)
    if world_size is None:
-        return pool.apply(_run_single_profile, (test,))
+        if bool(os.getenv("POPCORN_NCU", "0")):
+            return pool.apply(_run_single_profile_ncu, (test,))
+        else:
+            return pool.apply(_run_single_profile_torch, (test,))
     else:
         return run_multi_gpu_profile(pool, test, world_size)
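The NCU path inverts the usual control flow: eval.py does not spawn the profiler itself; the harness is expected to launch eval.py under Nsight Compute, and `_run_single_profile_ncu` just runs the kernel once inside the 'custom_kernel' NVTX range so the report can be filtered to it. The launch presumably looks something like `ncu --nvtx --nvtx-include "custom_kernel/" -o report python3 eval.py profile` (the exact flags the runner passes are not shown in this diff). One Python subtlety in the new toggle: `bool()` of any non-empty string is `True`, so `bool(os.getenv("POPCORN_NCU", "0"))` is truthy even for the default value `"0"`. A stricter parse, as a hedged sketch rather than what this commit ships:

```python
import os

def ncu_profiling_enabled() -> bool:
    # Hypothetical stricter toggle: only common "truthy" spellings enable the
    # NCU path; unset, "", "0", and "false" all fall back to the torch profiler.
    return os.getenv("POPCORN_NCU", "0").strip().lower() in ("1", "true", "yes")
```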

scripts/ci_test_cuda.py

Lines changed: 2 additions & 2 deletions
@@ -19,12 +19,12 @@ def run_cuda_helper(sources: dict, headers: dict = None, arch=None, **kwargs):
         headers = header_files

     eval_result = run_cuda_script(
-        make_system_info(),
         sources,
         headers,
         arch=arch,
         mode=SubmissionMode.TEST.value,
         tests="size: 256; seed: 42\n",
+        system=make_system_info(),
         **kwargs,
     )
     return eval_result.compilation, eval_result.run
@@ -195,12 +195,12 @@ def test_include_dirs(tmp_path: Path):

     # can also use generic flags argument
     result = run_cuda_script(
-        make_system_info(),
         {"eval.cu": eval_cu, "submission.cu": sub},
         header_files,
         flags=["-I.", f"-I{tmp_path}"],
         mode=SubmissionMode.TEST.value,
         tests="size: 256; seed: 42\n",
+        system=make_system_info(),
     )

     assert result.compilation.success is True

scripts/ci_test_python.py

Lines changed: 2 additions & 2 deletions
@@ -12,11 +12,11 @@

 def run_pytorch_helper(sources: dict, tests=None, **kwargs):
     result = run_pytorch_script(
-        make_system_info(),
         sources,
         "eval.py",
         mode=SubmissionMode.TEST.value,
         tests=tests or "size: 256; seed: 42\n",
+        system=make_system_info(),
         **kwargs,
     )
     return result.run
@@ -45,7 +45,7 @@ def custom_kernel(input):
     run = run_pytorch_helper({**files, "submission.py": sub})
     assert run.success is True
     assert run.passed is False
-    assert "python eval.py test" in run.command
+    assert "python3 eval.py test" in run.command
     assert run.stdout == ""
     assert run.stderr == ""


src/kernelbot/api/api_utils.py

Lines changed: 10 additions & 7 deletions
@@ -189,6 +189,8 @@ async def display_report(self, title: str, report: RunResultReport):
         elif isinstance(part, Log):
             self.long_report += f"\n\n## {part.header}:\n"
             self.long_report += f"```\n{part.content}```"
+
+
 # ruff: noqa: C901
 async def to_submit_info(
     user_info: Any,
@@ -197,14 +199,12 @@ async def to_submit_info(
     leaderboard_name: str,
     gpu_type: str,
     db_context: LeaderboardDB,
-) -> tuple[SubmissionRequest, SubmissionMode]: # noqa: C901
+) -> tuple[SubmissionRequest, SubmissionMode]:  # noqa: C901
     user_name = user_info["user_name"]
     user_id = user_info["user_id"]

     try:
-        submission_mode_enum: SubmissionMode = SubmissionMode(
-            submission_mode.lower()
-        )
+        submission_mode_enum: SubmissionMode = SubmissionMode(submission_mode.lower())
     except ValueError:
         raise HTTPException(
             status_code=400,
@@ -222,6 +222,11 @@ async def to_submit_info(
         SubmissionMode.BENCHMARK,
         SubmissionMode.LEADERBOARD,
     ]
+    if submission_mode_enum == SubmissionMode.PROFILE:
+        raise HTTPException(
+            status_code=400,
+            detail="Profile submissions are not currently supported via API, use Discord instead.",
+        )
     if submission_mode_enum not in allowed_modes:
         raise HTTPException(
             status_code=400,
@@ -263,9 +268,7 @@ async def to_submit_info(
     except HTTPException:
         raise
     except Exception as e:
-        raise HTTPException(
-            status_code=400, detail=f"Error reading submission file: {e}"
-        ) from e
+        raise HTTPException(status_code=400, detail=f"Error reading submission file: {e}") from e

     try:
         submission_code = submission_content.decode("utf-8")

src/kernelbot/discord_reporter.py

Lines changed: 7 additions & 1 deletion
@@ -1,7 +1,8 @@
 import discord
-from discord_utils import _send_split_log
+from discord_utils import _send_file, _send_split_log

 from libkernelbot.report import (
+    File,
     Link,
     Log,
     MultiProgressReporter,
@@ -70,6 +71,11 @@ async def display_report(self, title: str, report: RunResultReport):
             message += part.text
         elif isinstance(part, Log):
             message = await _send_split_log(thread, message, part.header, part.content)
+        elif isinstance(part, File):
+            if len(message) > 0:
+                await thread.send(message)
+            await _send_file(thread, part.message, part.name, part.content)
+            message = ""
         elif isinstance(part, Link):
             if len(message) > 0:
                 await thread.send(message)

src/kernelbot/discord_utils.py

Lines changed: 7 additions & 2 deletions
@@ -1,5 +1,6 @@
 import functools
 import logging
+from io import BytesIO

 import discord

@@ -124,7 +125,7 @@ async def _send_split_log(thread: discord.Thread, partial_message: str, header:
         else:
             if partial_message != "":
                 chunks.append(partial_message)
-            partial_message = line
+            partial_message = line + "\n"

     if partial_message != "":
         chunks.append(partial_message)
@@ -133,6 +134,10 @@ async def _send_split_log(thread: discord.Thread, partial_message: str, header:
     for i, chunk in enumerate(chunks):
         partial_message = f"\n\n## {header} ({i+1}/{len(chunks)}):\n"
         partial_message += f"```\n{limit_length(chunk, 1900)}```"
-        await thread.send(partial_message)
+        await thread.send(partial_message, silent=True)

     return ""
+
+
+async def _send_file(thread: discord.Thread, message: str, name: str, file: bytes):
+    await thread.send(message, file=discord.File(BytesIO(file), filename=name), silent=True)
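The new `_send_file` helper is what lets profile results arrive as attached files rather than inline logs: it wraps raw bytes in a `discord.File` and sends the message silently. A minimal usage sketch (the thread and report bytes here are hypothetical, not from this commit):

```python
# Hypothetical call site inside an async reporter: attach an NCU report
# to a run's Discord thread. `thread` is an existing discord.Thread;
# `report_bytes` is the profiler output already read into memory.
await _send_file(thread, "NCU profile report:", "profile.ncu-rep", report_bytes)
```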

src/libkernelbot/launchers/github.py

Lines changed: 2 additions & 2 deletions
@@ -143,7 +143,7 @@ async def run_submission( # noqa: C901
         # Update profile artifact to the actual download URL.
         # For the GitHub launcher the profile_artifact currently just contains
         # the name of the artifact.
-        if profile_res is not None:
+        if profile_res is not None and "profile-data" in index:
             profile_res.download_url = index["profile-data"].public_download_url

         res = EvalResult(
@@ -344,7 +344,7 @@ async def wait_for_completion(
                 return

             await callback(self)
-            await asyncio.sleep(20)  # Yield control while waiting
+            await asyncio.sleep(15)  # Yield control while waiting
         except TimeoutError:
             raise  # Re-raise the specific TimeoutError from the timeout block
         except Exception as e:
