Skip to content

Commit e26f6a8

Browse files
authored
[CI] Auto detect the available GPU devices and distribute them with CPUs (#2183)
disable_build
1 parent b2d2462 commit e26f6a8

File tree

7 files changed

+75
-63
lines changed

7 files changed

+75
-63
lines changed

.github/actions/get-runner/action.yml

Lines changed: 55 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -9,17 +9,17 @@ outputs:
99
value: ${{ steps.runner.outputs.render_id }}
1010
hostname:
1111
value: ${{ steps.runner.outputs.hostname }}
12-
xpu_num:
13-
value: ${{ steps.runner.outputs.xpu_num }}
14-
cpus_per_xpu:
15-
value: ${{ steps.runner.outputs.cpus_per_xpu }}
12+
ZE_AFFINITY_MASK:
13+
value: ${{ steps.cpu-gpu.outputs.ZE_AFFINITY_MASK }}
14+
numactl_args:
15+
value: ${{ steps.cpu-gpu.outputs.numactl_args }}
1616
pytest_extra_args:
17-
value: ${{ steps.runner.outputs.pytest_extra_args }}
17+
value: ${{ steps.cpu-gpu.outputs.pytest_extra_args }}
1818

1919
runs:
2020
using: composite
2121
steps:
22-
- name: Get runner
22+
- name: Get runner id
2323
shell: bash -xe {0}
2424
id: runner
2525
run: |
@@ -28,15 +28,22 @@ runs:
2828
echo "user_id=$(id -u)" |tee -a ${GITHUB_OUTPUT}
2929
echo "render_id=$(getent group render |cut -d: -f3)" |tee -a ${GITHUB_OUTPUT}
3030
echo "hostname=$(hostname)" |tee -a ${GITHUB_OUTPUT}
31+
- name: Show runner info
32+
shell: bash -xe {0}
33+
run: |
3134
# show host info
3235
lscpu
3336
lshw -C display
3437
free -h
3538
df -h
3639
cat /etc/os-release
3740
uname -a
38-
# clinfo hang and reboot system to recover
41+
dpkg -l |grep -E 'intel-opencl-icd|libze-dev|libigc-dev' || true
42+
# list OpenCL devices with clinfo; reboot the system to recover if it hangs (120s timeout) or fails
3943
timeout 120 clinfo --list || sudo reboot
44+
- name: CPU frequency check
45+
shell: bash -xe {0}
46+
run: |
4047
scaling_governor=$(cat /sys/devices/system/cpu/cpu*/cpufreq/scaling_governor |sort |uniq)
4148
if [ $(sudo -n true > /dev/null 2>&1 && echo $? || echo $?) -eq 0 ];then
4249
if [ "${scaling_governor}" != "performance" ];then
@@ -52,40 +59,63 @@ runs:
5259
echo "[INFO] You do NOT have ROOT permission to set system config."
5360
echo " The frequency governor is ${scaling_governor}."
5461
fi
62+
- name: Distribute CPUs and GPUs evenly
63+
id: cpu-gpu
64+
shell: bash -xe {0}
65+
run: |
5566
cpu_num="$(lscpu |grep -E 'Core\(s\) per socket:|Socket\(s\):' |awk 'BEGIN{sum=1}{sum*=$NF}END{printf sum}')"
56-
xpu_num="$(clinfo --list |awk 'BEGIN{gpu=0;}{
67+
total_xpu_num="$(ls /sys/class/drm/card*/device/enable |wc -l |awk '{if($1 > 8){n=8}else{n=$1};print n}')"
68+
online_xpu_num="$(clinfo --list |awk 'BEGIN{gpu=0;}{
5769
if(gpu==1 && $0~/Platform/){gpu=0}; if(gpu==1){print $0}; if($0~/Platform.*Graphics/){gpu=1}
5870
}' |wc -l)"
59-
cpus_per_xpu="$(echo |awk -v c="${cpu_num}" -v x="${xpu_num}" '{printf c/x}')"
60-
pytest_extra_args="$(echo |awk -v x="${xpu_num}" -v z="${ZE_AFFINITY_MASK}" -v cx="${cpus_per_xpu}" '{
61-
if (x > 0) {
62-
split(z, xpu_list, ",");
63-
for (i=0;i<x;i++) {
64-
if (z != "") {
65-
ze = xpu_list[i+1];
66-
} else {
67-
ze = i;
68-
}
71+
# check the online GPUs
72+
if [ ${online_xpu_num} -ne ${total_xpu_num} ];then
73+
echo "[Warning] Some cards are offline!"
74+
fi
75+
cpus_per_xpu="$(echo |awk -v c="${cpu_num}" -v x="${total_xpu_num}" '{printf c/x}')"
76+
# get all online GPUs
77+
xpu_list=""
78+
for id in $(seq 0 ${total_xpu_num})
79+
do
80+
xpu_detected="$(ZE_AFFINITY_MASK=$id clinfo --list |grep "Graphics" || true)"
81+
if [ "${xpu_detected}" != "" ] && [[ "${xpu_detected}" != *" UHD "* ]];then
82+
xpu_list+="${id},"
83+
fi
84+
done
85+
export ZE_AFFINITY_MASK=${xpu_list%,*}
86+
numactl_args="$(echo |awk -v z="${ZE_AFFINITY_MASK}" -v cx="${cpus_per_xpu}" '{
87+
split(z, xpu_list, ",");
88+
if (length(xpu_list) > 1) {
89+
for (i=0;i<length(xpu_list);i++) {
90+
printf(" ZE_AFFINITY_MASK=%d OMP_NUM_THREADS=%d numactl -l -C %d-%d ;", xpu_list[i+1], cx, i*cx, (i+1)*cx-1);
91+
}
92+
}else {
93+
printf(" numactl -l ");
94+
}
95+
}')"
96+
pytest_extra_args="$(echo |awk -v z="${ZE_AFFINITY_MASK}" -v cx="${cpus_per_xpu}" '{
97+
split(z, xpu_list, ",");
98+
if (length(xpu_list) > 1) {
99+
for (i=0;i<length(xpu_list);i++) {
100+
ze = xpu_list[i+1];
69101
printf(" --tx popen//env:ZE_AFFINITY_MASK=%d//env:OMP_NUM_THREADS=%d//python=\"numactl -l -C %d-%d python\"",
70102
ze, cx, i*cx, (i+1)*cx-1);
71103
}
72104
}else {
73105
printf(" -n 1 ");
74106
}
75107
}')"
76-
echo "xpu_num=${xpu_num}" |tee -a ${GITHUB_OUTPUT}
77-
echo "cpus_per_xpu=${cpus_per_xpu}" |tee -a ${GITHUB_OUTPUT}
108+
echo "ZE_AFFINITY_MASK=${ZE_AFFINITY_MASK}" |tee -a ${GITHUB_OUTPUT}
109+
echo "numactl_args=${numactl_args}" |tee -a ${GITHUB_OUTPUT}
78110
echo "pytest_extra_args=${pytest_extra_args}" |tee -a ${GITHUB_OUTPUT}
79111
- name: Cleanup host
80112
shell: bash -xe {0}
81113
run: |
82114
# clean docker cache
83115
docker system prune -af || true
84-
# clean workspace
85-
ls -al
86-
sudo find ./ |grep -v "^\./$" |xargs sudo rm -rf
116+
# fix potential permission issue
117+
sudo chown $(id -u):$(id -g) ${HOME} /tmp ./ -R
87118
cd ${RUNNER_WORKSPACE}/..
88119
if [ "${PWD}" != "/" ];then
89-
ls -al
90-
sudo chmod 777 -R . || true
120+
sudo chown $(id -u):$(id -g) . -R || true
91121
fi

.github/actions/linux-e2etest/action.yml

Lines changed: 18 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -56,25 +56,6 @@ runs:
5656
fi
5757
}
5858
run_with_shard() {
59-
for i in $(seq 0 $[ ${xpu_num} - 1 ])
60-
do
61-
cpu_list="$(echo "${cpus_per_xpu} ${i}" |awk '{printf("%d-%d", $1*$2, $1*$2+$1-1)}')"
62-
if [ "${ZE_AFFINITY_MASK}" != "" ];then
63-
xpu_list=($(echo ${ZE_AFFINITY_MASK} |sed 's/,/ /g'))
64-
xpu_id=${xpu_list[$[ ${i} + 1 ]]}
65-
else
66-
xpu_id=${i}
67-
fi
68-
numactl -l -C ${cpu_list} bash -x inductor_xpu_test.sh ${suite} ${dt} ${mode} ${scenario} xpu ${xpu_id} static ${xpu_num} ${i} "${models_list_args}" &
69-
done
70-
wait
71-
}
72-
export OMP_NUM_THREADS=${cpus_per_xpu}
73-
for suite in $(echo ${{ inputs.suite }} |sed 's/,/ /g')
74-
do
75-
if [ "${suite}" == "pt2e" ];then
76-
continue
77-
fi
7859
# set models to run
7960
if [ "${{ github.event_name }}" == "pull_request" ];then
8061
models_list_args="$(awk -F ',|;| ' '{printf(" -k %s ", $1)}' benchmarks/dynamo/$(echo ${suite} |sed 's/_models//')_models_list.txt)"
@@ -83,6 +64,24 @@ runs:
8364
else
8465
models_list_args=""
8566
fi
67+
# shards to run
68+
# For example: numactl_args=' ZE_AFFINITY_MASK=0 OMP_NUM_THREADS=12 numactl -l -C 0-11 ; ZE_AFFINITY_MASK=1 OMP_NUM_THREADS=12 numactl -l -C 12-23 ;'
69+
delimiter=";"
70+
IFS="${delimiter}" read -ra numactl_args_list <<< "${numactl_args}"
71+
instances=${#numactl_args_list[@]}
72+
for i in $(seq 0 $[ ${instances} - 1 ])
73+
do
74+
xpu_id="$(echo ${numactl_args_list[$i]} |sed 's/.*ZE_AFFINITY_MASK=//;s/ .*//')"
75+
eval ${numactl_args_list[$i]} bash -x inductor_xpu_test.sh \
76+
${suite} ${dt} ${mode} ${scenario} \
77+
xpu ${xpu_id} static ${instances} ${i} \
78+
\"${models_list_args}\" \
79+
&
80+
done
81+
wait
82+
}
83+
for suite in $(echo ${{ inputs.suite }} |sed 's/,/ /g')
84+
do
8685
if ! contains "huggingface,timm_models,torchbench" "$suite"; then
8786
continue
8887
fi

.github/workflows/_linux_build.yml

Lines changed: 0 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -74,8 +74,6 @@ jobs:
7474
run: |
7575
cat /etc/os-release
7676
hostname && id
77-
# Cleanup workspace
78-
find ./ |grep -v "^\./$" |xargs rm -rf
7977
# install gh
8078
dnf install -y 'dnf-command(config-manager)'
8179
dnf config-manager --add-repo https://cli.github.com/packages/rpm/gh-cli.repo
@@ -219,4 +217,3 @@ jobs:
219217
if: ${{ always() }}
220218
run: |
221219
chmod 777 /__w /github ./ -R
222-
find ./ |grep -v "^\./$" |xargs rm -rf

.github/workflows/_linux_e2e.yml

Lines changed: 2 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -61,12 +61,8 @@ jobs:
6161
user_id: ${{ steps.runner-info.outputs.user_id }}
6262
render_id: ${{ steps.runner-info.outputs.render_id }}
6363
hostname: ${{ steps.runner-info.outputs.hostname }}
64-
xpu_num: ${{ steps.runner-info.outputs.xpu_num }}
65-
cpus_per_xpu: ${{ steps.runner-info.outputs.cpus_per_xpu }}
64+
numactl_args: ${{ steps.runner-info.outputs.numactl_args }}
6665
steps:
67-
- name: Cleanup workspace
68-
run: |
69-
sudo find ./ |grep -v "^\./$" |xargs sudo rm -rf
7066
- name: Checkout torch-xpu-ops
7167
uses: actions/checkout@v4
7268
- name: Get runner
@@ -84,8 +80,7 @@ jobs:
8480
options: --device=/dev/mem --device=/dev/dri --group-add video --security-opt seccomp=unconfined --cap-add=SYS_PTRACE --shm-size=8g
8581
-u ${{ needs.runner.outputs.user_id }}:${{ needs.runner.outputs.render_id }}
8682
env:
87-
xpu_num: ${{ needs.runner.outputs.xpu_num }}
88-
cpus_per_xpu: ${{ needs.runner.outputs.cpus_per_xpu }}
83+
numactl_args: ${{ needs.runner.outputs.numactl_args }}
8984
MODEL_ONLY_NAME: ${{ inputs.model }}
9085
AGENT_TOOLSDIRECTORY: /tmp/xpu-tool
9186
steps:

.github/workflows/_linux_op_benchmark.yml

Lines changed: 0 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -39,9 +39,6 @@ jobs:
3939
render_id: ${{ steps.runner-info.outputs.render_id }}
4040
hostname: ${{ steps.runner-info.outputs.hostname }}
4141
steps:
42-
- name: Cleanup workspace
43-
run: |
44-
sudo find ./ |grep -v "^\./$" |xargs sudo rm -rf
4542
- name: Checkout torch-xpu-ops
4643
uses: actions/checkout@v4
4744
- name: Get runner

.github/workflows/_linux_ut.yml

Lines changed: 0 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -47,9 +47,6 @@ jobs:
4747
hostname: ${{ steps.runner-info.outputs.hostname }}
4848
pytest_extra_args: ${{ steps.runner-info.outputs.pytest_extra_args }}
4949
steps:
50-
- name: Cleanup workspace
51-
run: |
52-
sudo find ./ |grep -v "^\./$" |xargs sudo rm -rf
5350
- name: Checkout torch-xpu-ops
5451
uses: actions/checkout@v4
5552
- name: Get runner

.github/workflows/_performance_comparison.yml

Lines changed: 0 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -22,9 +22,6 @@ jobs:
2222
GH_TOKEN: ${{ github.token }}
2323
runs-on: ubuntu-24.04
2424
steps:
25-
- name: Cleanup workspace
26-
run: |
27-
rm -rf ./target ./baseline
2825
- name: Setup python
2926
uses: actions/setup-python@v5
3027
with:

0 commit comments

Comments
 (0)