@@ -9,17 +9,17 @@ outputs:
99 value : ${{ steps.runner.outputs.render_id }}
1010 hostname :
1111 value : ${{ steps.runner.outputs.hostname }}
12- xpu_num :
13- value : ${{ steps.runner .outputs.xpu_num }}
14- cpus_per_xpu :
15- value : ${{ steps.runner .outputs.cpus_per_xpu }}
12+ ZE_AFFINITY_MASK :
13+ value : ${{ steps.cpu-gpu .outputs.ZE_AFFINITY_MASK }}
14+ numactl_args :
15+ value : ${{ steps.cpu-gpu .outputs.numactl_args }}
1616 pytest_extra_args :
17- value : ${{ steps.runner .outputs.pytest_extra_args }}
17+ value : ${{ steps.cpu-gpu .outputs.pytest_extra_args }}
1818
1919runs :
2020 using : composite
2121 steps :
22- - name : Get runner
22+ - name : Get runner id
2323 shell : bash -xe {0}
2424 id : runner
2525 run : |
@@ -28,15 +28,22 @@ runs:
2828 echo "user_id=$(id -u)" |tee -a ${GITHUB_OUTPUT}
2929 echo "render_id=$(getent group render |cut -d: -f3)" |tee -a ${GITHUB_OUTPUT}
3030 echo "hostname=$(hostname)" |tee -a ${GITHUB_OUTPUT}
31+ - name : Show runner info
32+ shell : bash -xe {0}
33+ run : |
3134 # show host info: CPU, display adapters, memory, disk usage, OS release and kernel
3235 lscpu
3336 lshw -C display
3437 free -h
3538 df -h
3639 cat /etc/os-release
3740 uname -a
38- # clinfo hang and reboot system to recover
41+ dpkg -l |grep -E 'intel-opencl-icd|libze-dev|libigc-dev' || true  # report which of these packages are installed; "|| true" keeps the step going when none match
42+ # list OpenCL platforms/devices; reboot the host when clinfo fails or hangs for more than 120 seconds
3943 timeout 120 clinfo --list || sudo reboot
44+ - name : CPU frequency check
45+ shell : bash -xe {0}
46+ run : |
4047 scaling_governor=$(cat /sys/devices/system/cpu/cpu*/cpufreq/scaling_governor |sort |uniq)
4148 if [ $(sudo -n true > /dev/null 2>&1 && echo $? || echo $?) -eq 0 ];then
4249 if [ "${scaling_governor}" != "performance" ];then
@@ -52,40 +59,63 @@ runs:
5259 echo "[INFO] You do NOT have ROOT permission to set system config."
5360 echo " The frequency governor is ${scaling_governor}."
5461 fi
62+ - name : Distribute CPUs and GPUs evenly
63+ id : cpu-gpu
64+ shell : bash -xe {0}
65+ run : |
5566 cpu_num="$(lscpu |grep -E 'Core\(s\) per socket:|Socket\(s\):' |awk 'BEGIN{sum=1}{sum*=$NF}END{printf sum}')"
56- xpu_num="$(clinfo --list |awk 'BEGIN{gpu=0;}{
67+ total_xpu_num="$(ls /sys/class/drm/card*/device/enable |wc -l |awk '{if($1 > 8){n=8}else{n=$1};print n}')"
68+ online_xpu_num="$(clinfo --list |awk 'BEGIN{gpu=0;}{
5769 if(gpu==1 && $0~/Platform/){gpu=0}; if(gpu==1){print $0}; if($0~/Platform.*Graphics/){gpu=1}
5870 }' |wc -l)"
59- cpus_per_xpu="$(echo |awk -v c="${cpu_num}" -v x="${xpu_num}" '{printf c/x}')"
60- pytest_extra_args="$(echo |awk -v x="${xpu_num}" -v z="${ZE_AFFINITY_MASK}" -v cx="${cpus_per_xpu}" '{
61- if (x > 0) {
62- split(z, xpu_list, ",");
63- for (i=0;i<x;i++) {
64- if (z != "") {
65- ze = xpu_list[i+1];
66- } else {
67- ze = i;
68- }
71+ # warn when clinfo sees a different number of GPUs than sysfs lists (sysfs count is capped at 8)
72+ if [ ${online_xpu_num} -ne ${total_xpu_num} ];then
73+ echo "[Warning] Some cards are offline!"
74+ fi
75+ cpus_per_xpu="$(echo |awk -v c="${cpu_num}" -v x="${total_xpu_num}" '{printf "%s", (x > 0 ? c/x : c)}')"  # avoid a fatal awk division by zero when no card is found
76+ # probe every device index 0..total_xpu_num-1 and keep those clinfo reports, skipping " UHD " graphics
77+ xpu_list=""
78+ for id in $(seq 0 $((total_xpu_num - 1)))
79+ do
80+ xpu_detected="$(ZE_AFFINITY_MASK=$id clinfo --list |grep "Graphics" || true)"
81+ if [ "${xpu_detected}" != "" ] && [[ "${xpu_detected}" != *" UHD "* ]];then
82+ xpu_list+="${id},"
83+ fi
84+ done
85+ export ZE_AFFINITY_MASK="${xpu_list%,}"
86+ numactl_args="$(echo |awk -v z="${ZE_AFFINITY_MASK}" -v cx="${cpus_per_xpu}" '{
87+ n = split(z, xpu_list, ",");
88+ if (n > 1) {
89+ for (i=0;i<n;i++) {
90+ printf(" ZE_AFFINITY_MASK=%d OMP_NUM_THREADS=%d numactl -l -C %d-%d ;", xpu_list[i+1], cx, i*cx, (i+1)*cx-1);
91+ }
92+ }else {
93+ printf(" numactl -l ");
94+ }
95+ }')"
96+ pytest_extra_args="$(echo |awk -v z="${ZE_AFFINITY_MASK}" -v cx="${cpus_per_xpu}" '{
97+ n = split(z, xpu_list, ",");
98+ if (n > 1) {
99+ for (i=0;i<n;i++) {
100+ ze = xpu_list[i+1];
69101 printf(" --tx popen//env:ZE_AFFINITY_MASK=%d//env:OMP_NUM_THREADS=%d//python=\"numactl -l -C %d-%d python\"",
70102 ze, cx, i*cx, (i+1)*cx-1);
71103 }
72104 }else {
73105 printf(" -n 1 ");
74106 }
75107 }')"
76- echo "xpu_num =${xpu_num }" |tee -a ${GITHUB_OUTPUT}
77- echo "cpus_per_xpu =${cpus_per_xpu }" |tee -a ${GITHUB_OUTPUT}
108+ echo "ZE_AFFINITY_MASK=${ZE_AFFINITY_MASK}" |tee -a ${GITHUB_OUTPUT}
109+ echo "numactl_args=${numactl_args}" |tee -a ${GITHUB_OUTPUT}
78110 echo "pytest_extra_args=${pytest_extra_args}" |tee -a ${GITHUB_OUTPUT}
79111 - name : Cleanup host
80112 shell : bash -xe {0}
81113 run : |
82114 # prune all unused docker data; "|| true" keeps the step going if docker fails
83115 docker system prune -af || true
84- # clean workspace
85- ls -al
86- sudo find ./ |grep -v "^\./$" |xargs sudo rm -rf
116+ # recursively hand ${HOME}, /tmp and the working directory to the current user (NOTE(review): unlike the chown below this one has no "|| true" and covers all of /tmp — confirm both are intended)
117+ sudo chown $(id -u):$(id -g) ${HOME} /tmp ./ -R
87118 cd ${RUNNER_WORKSPACE}/..
88119 if [ "${PWD}" != "/" ];then  # skip the recursive chown when the parent directory is the filesystem root
89- ls -al
90- sudo chmod 777 -R . || true
120+ sudo chown $(id -u):$(id -g) . -R || true
91121 fi
0 commit comments