Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
80 changes: 55 additions & 25 deletions .github/actions/get-runner/action.yml
Original file line number Diff line number Diff line change
Expand Up @@ -9,17 +9,17 @@ outputs:
value: ${{ steps.runner.outputs.render_id }}
hostname:
value: ${{ steps.runner.outputs.hostname }}
xpu_num:
value: ${{ steps.runner.outputs.xpu_num }}
cpus_per_xpu:
value: ${{ steps.runner.outputs.cpus_per_xpu }}
ZE_AFFINITY_MASK:
value: ${{ steps.cpu-gpu.outputs.ZE_AFFINITY_MASK }}
numactl_args:
value: ${{ steps.cpu-gpu.outputs.numactl_args }}
pytest_extra_args:
value: ${{ steps.runner.outputs.pytest_extra_args }}
value: ${{ steps.cpu-gpu.outputs.pytest_extra_args }}

runs:
using: composite
steps:
- name: Get runner
- name: Get runner id
shell: bash -xe {0}
id: runner
run: |
Expand All @@ -28,15 +28,22 @@ runs:
echo "user_id=$(id -u)" |tee -a ${GITHUB_OUTPUT}
echo "render_id=$(getent group render |cut -d: -f3)" |tee -a ${GITHUB_OUTPUT}
echo "hostname=$(hostname)" |tee -a ${GITHUB_OUTPUT}
# Print hardware, OS and GPU-package details of the runner into the job log.
- name: Show runner info
shell: bash -xe {0}
run: |
# Show host info: CPU, display adapters, memory, disk usage, OS release and kernel.
lscpu
lshw -C display
free -h
df -h
cat /etc/os-release
uname -a
# List the installed packages matching the pattern; "|| true" keeps the step alive under "bash -e" when grep matches nothing.
dpkg -l |grep -E 'intel-opencl-icd|libze-dev|libigc-dev' || true
# Probe the devices with a 120-second limit; if clinfo fails or times out (e.g. it hangs), reboot the host to recover.
timeout 120 clinfo --list || sudo reboot
- name: CPU frequency check
shell: bash -xe {0}
run: |
scaling_governor=$(cat /sys/devices/system/cpu/cpu*/cpufreq/scaling_governor |sort |uniq)
if [ $(sudo -n true > /dev/null 2>&1 && echo $? || echo $?) -eq 0 ];then
if [ "${scaling_governor}" != "performance" ];then
Expand All @@ -52,40 +59,63 @@ runs:
echo "[INFO] You do NOT have ROOT permission to set system config."
echo " The frequency governor is ${scaling_governor}."
fi
- name: Distribute CPUs and GPUs evenly
id: cpu-gpu
shell: bash -xe {0}
run: |
cpu_num="$(lscpu |grep -E 'Core\(s\) per socket:|Socket\(s\):' |awk 'BEGIN{sum=1}{sum*=$NF}END{printf sum}')"
xpu_num="$(clinfo --list |awk 'BEGIN{gpu=0;}{
total_xpu_num="$(ls /sys/class/drm/card*/device/enable |wc -l |awk '{if($1 > 8){n=8}else{n=$1};print n}')"
online_xpu_num="$(clinfo --list |awk 'BEGIN{gpu=0;}{
if(gpu==1 && $0~/Platform/){gpu=0}; if(gpu==1){print $0}; if($0~/Platform.*Graphics/){gpu=1}
}' |wc -l)"
cpus_per_xpu="$(echo |awk -v c="${cpu_num}" -v x="${xpu_num}" '{printf c/x}')"
pytest_extra_args="$(echo |awk -v x="${xpu_num}" -v z="${ZE_AFFINITY_MASK}" -v cx="${cpus_per_xpu}" '{
if (x > 0) {
split(z, xpu_list, ",");
for (i=0;i<x;i++) {
if (z != "") {
ze = xpu_list[i+1];
} else {
ze = i;
}
# Warn (without failing) when clinfo reports a different number of GPUs
# than the number of cards counted in total_xpu_num.
if [ "${online_xpu_num}" -ne "${total_xpu_num}" ];then
    echo "[Warning] Some cards are offline!"
fi
# CPU cores assigned to each card: the cores are split into total_xpu_num shares.
cpus_per_xpu="$(echo |awk -v c="${cpu_num}" -v x="${total_xpu_num}" '{printf c/x}')"
# Probe the device indices one at a time and keep those where clinfo lists a
# "Graphics" device that is not an " UHD " one.
# Indices are 0-based, so the last one to probe is total_xpu_num - 1; probing
# total_xpu_num itself could select one device more than there are CPU shares,
# and that device would be given a core range past the last core.
xpu_list=""
for id in $(seq 0 $(( total_xpu_num - 1 )))
do
    xpu_detected="$(ZE_AFFINITY_MASK=${id} clinfo --list |grep "Graphics" || true)"
    if [ "${xpu_detected}" != "" ] && [[ "${xpu_detected}" != *" UHD "* ]];then
        xpu_list+="${id},"
    fi
done
# Strip the trailing comma so the mask reads like "0,1,2".
export ZE_AFFINITY_MASK=${xpu_list%,*}
# Build one launch prefix per GPU listed in ZE_AFFINITY_MASK, each terminated by ";".
# Entry i sets ZE_AFFINITY_MASK to the i-th listed GPU, sets OMP_NUM_THREADS to
# cpus_per_xpu, and binds to CPUs i*cx .. (i+1)*cx-1 with local memory allocation.
# When the list holds at most one GPU, only " numactl -l " is emitted (no CPU binding).
numactl_args="$(echo |awk -v z="${ZE_AFFINITY_MASK}" -v cx="${cpus_per_xpu}" '{
split(z, xpu_list, ",");
if (length(xpu_list) > 1) {
for (i=0;i<length(xpu_list);i++) {
printf(" ZE_AFFINITY_MASK=%d OMP_NUM_THREADS=%d numactl -l -C %d-%d ;", xpu_list[i+1], cx, i*cx, (i+1)*cx-1);
}
}else {
printf(" numactl -l ");
}
}')"
# Same GPU/CPU split expressed as pytest options: one "--tx popen//..." worker spec
# per listed GPU, or " -n 1 " when the list holds at most one GPU.
# NOTE(review): the "--tx"/"-n" syntax is presumably consumed by pytest-xdist - confirm against the caller.
pytest_extra_args="$(echo |awk -v z="${ZE_AFFINITY_MASK}" -v cx="${cpus_per_xpu}" '{
split(z, xpu_list, ",");
if (length(xpu_list) > 1) {
for (i=0;i<length(xpu_list);i++) {
ze = xpu_list[i+1];
printf(" --tx popen//env:ZE_AFFINITY_MASK=%d//env:OMP_NUM_THREADS=%d//python=\"numactl -l -C %d-%d python\"",
ze, cx, i*cx, (i+1)*cx-1);
}
}else {
printf(" -n 1 ");
}
}')"
echo "xpu_num=${xpu_num}" |tee -a ${GITHUB_OUTPUT}
echo "cpus_per_xpu=${cpus_per_xpu}" |tee -a ${GITHUB_OUTPUT}
echo "ZE_AFFINITY_MASK=${ZE_AFFINITY_MASK}" |tee -a ${GITHUB_OUTPUT}
echo "numactl_args=${numactl_args}" |tee -a ${GITHUB_OUTPUT}
echo "pytest_extra_args=${pytest_extra_args}" |tee -a ${GITHUB_OUTPUT}
# Reclaim disk space on the runner and reset ownership/permissions of the
# directories the jobs write to.
- name: Cleanup host
  shell: bash -xe {0}
  run: |
    # clean docker cache (best effort)
    docker system prune -af || true
    # clean workspace: remove every entry of the current directory but keep
    # the directory itself. "find -exec ... {} +" hands the paths to rm
    # directly, so names containing spaces or quotes are never re-split the
    # way "find | xargs" would split them before a privileged "rm -rf".
    ls -al
    sudo find ./ -mindepth 1 -maxdepth 1 -exec rm -rf -- {} +
    # fix potential permission issue
    sudo chown -R "$(id -u):$(id -g)" "${HOME}" /tmp ./
    # Also open up the parent of the runner workspace. The "/" guard keeps the
    # recursive chmod/chown away from the root filesystem when
    # RUNNER_WORKSPACE is empty or unset.
    cd "${RUNNER_WORKSPACE}/.."
    if [ "${PWD}" != "/" ];then
      ls -al
      sudo chmod 777 -R . || true
      sudo chown "$(id -u):$(id -g)" . -R || true
    fi
5 changes: 4 additions & 1 deletion .github/actions/linux-e2etest/action.yml
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,10 @@ runs:
return 1 # not match
fi
}
# numactl_args carries one launch prefix per entry, each terminated by ";", such as
# numactl_args=' ZE_AFFINITY_MASK=0 OMP_NUM_THREADS=12 numactl -l -C 0-11 ; ZE_AFFINITY_MASK=1 OMP_NUM_THREADS=12 numactl -l -C 12-23 ;'
# Split it on ";" into the numactl_args_list array and record the number of entries.
delimiter=";"
IFS="${delimiter}" read -ra numactl_args_list <<< "${numactl_args}"
instances=${#numactl_args_list[@]}
run_with_shard() {
for i in $(seq 0 $[ ${xpu_num} - 1 ])
do
Expand All @@ -69,7 +73,6 @@ runs:
done
wait
}
export OMP_NUM_THREADS=${cpus_per_xpu}
for suite in $(echo ${{ inputs.suite }} |sed 's/,/ /g')
do
if [ "${suite}" == "pt2e" ];then
Expand Down
3 changes: 0 additions & 3 deletions .github/workflows/_linux_build.yml
Original file line number Diff line number Diff line change
Expand Up @@ -74,8 +74,6 @@ jobs:
run: |
cat /etc/os-release
hostname && id
# Cleanup workspace
find ./ |grep -v "^\./$" |xargs rm -rf
# install gh
dnf install -y 'dnf-command(config-manager)'
dnf config-manager --add-repo https://cli.github.com/packages/rpm/gh-cli.repo
Expand Down Expand Up @@ -219,4 +217,3 @@ jobs:
if: ${{ always() }}
run: |
chmod 777 /__w /github ./ -R
find ./ |grep -v "^\./$" |xargs rm -rf
9 changes: 2 additions & 7 deletions .github/workflows/_linux_e2e.yml
Original file line number Diff line number Diff line change
Expand Up @@ -61,12 +61,8 @@ jobs:
user_id: ${{ steps.runner-info.outputs.user_id }}
render_id: ${{ steps.runner-info.outputs.render_id }}
hostname: ${{ steps.runner-info.outputs.hostname }}
xpu_num: ${{ steps.runner-info.outputs.xpu_num }}
cpus_per_xpu: ${{ steps.runner-info.outputs.cpus_per_xpu }}
numactl_args: ${{ steps.runner-info.outputs.numactl_args }}
steps:
- name: Cleanup workspace
run: |
sudo find ./ |grep -v "^\./$" |xargs sudo rm -rf
- name: Checkout torch-xpu-ops
uses: actions/checkout@v4
- name: Get runner
Expand All @@ -84,8 +80,7 @@ jobs:
options: --device=/dev/mem --device=/dev/dri --group-add video --security-opt seccomp=unconfined --cap-add=SYS_PTRACE --shm-size=8g
-u ${{ needs.runner.outputs.user_id }}:${{ needs.runner.outputs.render_id }}
env:
xpu_num: ${{ needs.runner.outputs.xpu_num }}
cpus_per_xpu: ${{ needs.runner.outputs.cpus_per_xpu }}
numactl_args: ${{ needs.runner.outputs.numactl_args }}
MODEL_ONLY_NAME: ${{ inputs.model }}
AGENT_TOOLSDIRECTORY: /tmp/xpu-tool
steps:
Expand Down
3 changes: 0 additions & 3 deletions .github/workflows/_linux_op_benchmark.yml
Original file line number Diff line number Diff line change
Expand Up @@ -39,9 +39,6 @@ jobs:
render_id: ${{ steps.runner-info.outputs.render_id }}
hostname: ${{ steps.runner-info.outputs.hostname }}
steps:
- name: Cleanup workspace
run: |
sudo find ./ |grep -v "^\./$" |xargs sudo rm -rf
- name: Checkout torch-xpu-ops
uses: actions/checkout@v4
- name: Get runner
Expand Down
3 changes: 0 additions & 3 deletions .github/workflows/_linux_ut.yml
Original file line number Diff line number Diff line change
Expand Up @@ -47,9 +47,6 @@ jobs:
hostname: ${{ steps.runner-info.outputs.hostname }}
pytest_extra_args: ${{ steps.runner-info.outputs.pytest_extra_args }}
steps:
- name: Cleanup workspace
run: |
sudo find ./ |grep -v "^\./$" |xargs sudo rm -rf
- name: Checkout torch-xpu-ops
uses: actions/checkout@v4
- name: Get runner
Expand Down
3 changes: 0 additions & 3 deletions .github/workflows/_performance_comparison.yml
Original file line number Diff line number Diff line change
Expand Up @@ -22,9 +22,6 @@ jobs:
GH_TOKEN: ${{ github.token }}
runs-on: ubuntu-24.04
steps:
- name: Cleanup workspace
run: |
rm -rf ./target ./baseline
- name: Setup python
uses: actions/setup-python@v5
with:
Expand Down
Loading