Skip to content

Commit 5c61860

Browse files
committed
Add profiling and analysis scripts
- Add fix_ncu_permissions.sh for NCU permission management
- Add tools/profiling/post_process_ncu.py for NCU data analysis
- Add vllm/v1/sample/random_utils.py for random sampling utilities
- Remove obsolete SCV baseline files
1 parent 44866a3 commit 5c61860

File tree

7 files changed

+431
-961
lines changed

7 files changed

+431
-961
lines changed

fix_ncu_permissions.sh

Lines changed: 126 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,126 @@
#!/bin/bash
#
# Fix NCU Permissions - Enable NVIDIA GPU Performance Counter Access
#
# NCU needs special permissions before it can read GPU performance
# counters; this script tries to enable them.
#

# Stop at the first unhandled command failure.
set -e

banner_rule="=========================================="
echo "$banner_rule"
echo "Fixing NCU Permissions"
echo "$banner_rule"
echo ""

# Informational only: the script carries on whether or not it is root.
if ! [ "$EUID" -eq 0 ]; then
  echo "⚠ Not running as root. You may need sudo for some operations."
else
  echo "✓ Running as root"
fi

echo ""
echo "Enabling GPU performance counter access..."
echo ""
# Method 1: Persist NVreg_RestrictProfilingToAdminUsers=0 in a modprobe config
# and try to reload the NVIDIA kernel modules so it applies without a reboot.
# The file is written under /etc/modprobe.d, so the setting is NOT lost on
# reboot; only the module reload is a one-off attempt.
echo "Method 1: Persistent modprobe config (applied on module reload or reboot)"
echo "-----------------------------------------"

# Run a command with root privileges. sudo is only used when the caller is not
# already root, so the script also works where sudo is not installed.
run_privileged() {
  if [ "$(id -u)" -eq 0 ]; then
    "$@"
  else
    sudo "$@"
  fi
}

if [ -f /proc/driver/nvidia/params ]; then
  echo "Setting NVreg_RestrictProfilingToAdminUsers=0..."
  if run_privileged sh -c 'echo "options nvidia NVreg_RestrictProfilingToAdminUsers=0" > /etc/modprobe.d/nvidia-profiling.conf'; then
    echo "✓ Modprobe config updated"
    echo ""
    echo "Reloading NVIDIA kernel module..."

    # Remember which companion modules are loaded right now so that they can
    # be restored after the unload; previously only "nvidia" was loaded back,
    # leaving nvidia_uvm / nvidia_drm / nvidia_modeset unloaded.
    modules_to_reload=(nvidia)
    for nv_mod in nvidia_modeset nvidia_drm nvidia_uvm; do
      if [ -d "/sys/module/${nv_mod}" ]; then
        modules_to_reload+=("$nv_mod")
      fi
    done

    reload_ok=true
    if run_privileged modprobe -r nvidia_uvm nvidia_drm nvidia_modeset nvidia; then
      for nv_mod in "${modules_to_reload[@]}"; do
        run_privileged modprobe "$nv_mod" || reload_ok=false
      done
    else
      # Typically the modules are still in use (display server, CUDA jobs).
      reload_ok=false
    fi

    if [ "$reload_ok" = true ]; then
      echo "✓ NVIDIA module reloaded"
    else
      echo "⚠ Could not reload module. You may need to reboot."
    fi
  else
    echo "✗ Failed to update modprobe config"
  fi
else
  echo "⚠ NVIDIA driver not found at /proc/driver/nvidia/params"
fi
echo ""
echo "Method 2: Immediate fix (current session only)"
echo "-----------------------------------------"
# Display the value the loaded driver currently reports through sysfs; this
# section only reads the parameter, it does not change it.
restrict_param=/sys/module/nvidia/parameters/NVreg_RestrictProfilingToAdminUsers
if [ ! -f "$restrict_param" ]; then
  echo "⚠ Parameter file not found"
else
  echo "Current value:"
  cat "$restrict_param"
  echo ""

  echo "Note: Cannot modify this sysfs parameter directly."
  echo "The modprobe configuration above will take effect after module reload or reboot."
fi
echo ""
echo "Method 3: Using nvidia-modprobe (if available)"
echo "-----------------------------------------"
if ! command -v nvidia-modprobe > /dev/null 2>&1; then
  echo "⚠ nvidia-modprobe not found"
else
  echo "Running nvidia-modprobe..."
  # Best effort: a failure here is deliberately ignored so the script goes on.
  sudo nvidia-modprobe || :
  echo "✓ Done"
fi
echo ""
echo "=========================================="
echo "Verification"
echo "=========================================="
echo ""

# Test NCU access by listing the metrics ncu reports and looking for the
# dram__bytes family.
if command -v ncu &> /dev/null; then
  echo "Testing NCU access with a simple command..."
  # Query once and reuse the captured text for both the check and the
  # diagnostic dump, so the output shown is exactly the output that was tested
  # and the query is not executed a second time. "|| true" keeps a non-zero
  # ncu exit status from aborting the script under "set -e".
  ncu_metrics_output=$(ncu --query-metrics 2>&1) || true
  if grep -q "dram__bytes" <<< "$ncu_metrics_output"; then
    echo "✓ NCU can access performance counters!"
  else
    echo "⚠ NCU may still have permission issues"
    echo ""
    echo "Output from ncu --query-metrics:"
    head -n 20 <<< "$ncu_metrics_output"
  fi
else
  echo "⚠ ncu command not found"
fi
# Closing guidance for the user. Every argument is printed on its own line;
# single quotes keep the backslashes and double quotes literal.
printf '%s\n' \
  '' \
  '==========================================' \
  'Next Steps' \
  '==========================================' \
  '' \
  '1. If the temporary fix worked, you can now run NCU profiling:' \
  ' ./run_ncu_bandwidth_test.sh' \
  '' \
  '2. To make the fix permanent across reboots:' \
  ' - The modprobe config has been created at:' \
  ' /etc/modprobe.d/nvidia-profiling.conf' \
  ' - It will be loaded on next boot' \
  '' \
  '3. If you still see permission errors, you may need to:' \
  ' - Reboot the system for changes to take effect' \
  ' - OR run the profiling command with sudo:' \
  ' sudo ./run_ncu_bandwidth_test.sh' \
  '' \
  '4. Alternative: Run the microbench directly with sudo:' \
  ' sudo python3 tools/profiling/run_nwor_microbench.py \' \
  ' --scenario short --requests 8 --batches 2 --draft-tokens 4 \' \
  ' --temperature 0.7 --nwor-modes off --scv-modes off \' \
  ' --enable-ncu --ncu-metrics "dram__bytes_write.sum" \' \
  ' --output test_ncu.json' \
  ''
# Report the driver version and GPU name; fall back to a notice when the
# nvidia-smi call does not succeed.
echo "Current NVIDIA Driver Info:"
echo "----------------------------"
if ! nvidia-smi --query-gpu=driver_version,name --format=csv,noheader 2> /dev/null; then
  echo "nvidia-smi not available"
fi
echo ""

echo "Done!"

0 commit comments

Comments
 (0)