diff --git a/.github/actions/get-runner/action.yml b/.github/actions/get-runner/action.yml index 36fc6c6bfc..f23c69d69d 100644 --- a/.github/actions/get-runner/action.yml +++ b/.github/actions/get-runner/action.yml @@ -9,17 +9,17 @@ outputs: value: ${{ steps.runner.outputs.render_id }} hostname: value: ${{ steps.runner.outputs.hostname }} - xpu_num: - value: ${{ steps.runner.outputs.xpu_num }} - cpus_per_xpu: - value: ${{ steps.runner.outputs.cpus_per_xpu }} + ZE_AFFINITY_MASK: + value: ${{ steps.cpu-gpu.outputs.ZE_AFFINITY_MASK }} + numactl_args: + value: ${{ steps.cpu-gpu.outputs.numactl_args }} pytest_extra_args: - value: ${{ steps.runner.outputs.pytest_extra_args }} + value: ${{ steps.cpu-gpu.outputs.pytest_extra_args }} runs: using: composite steps: - - name: Get runner + - name: Get runner id shell: bash -xe {0} id: runner run: | @@ -28,6 +28,9 @@ runs: echo "user_id=$(id -u)" |tee -a ${GITHUB_OUTPUT} echo "render_id=$(getent group render |cut -d: -f3)" |tee -a ${GITHUB_OUTPUT} echo "hostname=$(hostname)" |tee -a ${GITHUB_OUTPUT} + - name: Show runner info + shell: bash -xe {0} + run: | # show host info lscpu lshw -C display @@ -35,8 +38,12 @@ runs: df -h cat /etc/os-release uname -a - # clinfo hang and reboot system to recover + dpkg -l |grep -E 'intel-opencl-icd|libze-dev|libigc-dev' || true + # clinfo and reboot system to recover if hang timeout 120 clinfo --list || sudo reboot + - name: CPU frequency check + shell: bash -xe {0} + run: | scaling_governor=$(cat /sys/devices/system/cpu/cpu*/cpufreq/scaling_governor |sort |uniq) if [ $(sudo -n true > /dev/null 2>&1 && echo $? || echo $?) -eq 0 ];then if [ "${scaling_governor}" != "performance" ];then @@ -52,20 +59,45 @@ runs: echo "[INFO] You do NOT have ROOT permission to set system config." echo " The frequency governor is ${scaling_governor}." fi + - name: Distribute CPUs and GPUs evently + id: cpu-gpu + shell: bash -xe {0} + run: | cpu_num="$(lscpu |grep -E 'Core\(s\) per socket:|Socket\(s\):' |awk 'BEGIN{sum=1}{sum*=$NF}END{printf sum}')" - xpu_num="$(clinfo --list |awk 'BEGIN{gpu=0;}{ + total_xpu_num="$(ls /sys/class/drm/card*/device/enable |wc -l |awk '{if($1 > 8){n=8}else{n=$1};print n}')" + online_xpu_num="$(clinfo --list |awk 'BEGIN{gpu=0;}{ if(gpu==1 && $0~/Platform/){gpu=0}; if(gpu==1){print $0}; if($0~/Platform.*Graphics/){gpu=1} }' |wc -l)" - cpus_per_xpu="$(echo |awk -v c="${cpu_num}" -v x="${xpu_num}" '{printf c/x}')" - pytest_extra_args="$(echo |awk -v x="${xpu_num}" -v z="${ZE_AFFINITY_MASK}" -v cx="${cpus_per_xpu}" '{ - if (x > 0) { - split(z, xpu_list, ","); - for (i=0;i 1) { + for (i=0;i 1) { + for (i=0;i