Skip to content

Commit 1096b04

Browse files
committed
add distributed UT xdist
1 parent 0c66c2e commit 1096b04

File tree

3 files changed

+43
-53
lines changed

3 files changed

+43
-53
lines changed

.github/actions/get-runner/action.yml

Lines changed: 39 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,11 @@
11
name: Get Runner Infos
22

3+
inputs:
4+
ut_name:
5+
required: true
6+
type: string
7+
description: Which ut to launch
8+
39
outputs:
410
runner_id:
511
value: ${{ steps.runner.outputs.runner_id }}
@@ -57,22 +63,41 @@ runs:
5763
if(gpu==1 && $0~/Platform/){gpu=0}; if(gpu==1){print $0}; if($0~/Platform.*Graphics/){gpu=1}
5864
}' |wc -l)"
5965
cpus_per_xpu="$(echo |awk -v c="${cpu_num}" -v x="${xpu_num}" '{printf c/x}')"
60-
pytest_extra_args="$(echo |awk -v x="${xpu_num}" -v z="${ZE_AFFINITY_MASK}" -v cx="${cpus_per_xpu}" '{
61-
if (x > 0) {
62-
split(z, xpu_list, ",");
63-
for (i=0;i<x;i++) {
64-
if (z != "") {
65-
ze = xpu_list[i+1];
66-
} else {
67-
ze = i;
66+
if [ "${{ inputs.ut_name }}" == "xpu_distributed" ];then
67+
pytest_extra_args="$(echo |awk -v x="${xpu_num}" -v z="${ZE_AFFINITY_MASK}" -v cx="${cpus_per_xpu}" '{
68+
if (x > 0) {
69+
split(z, xpu_list, ",");
70+
for (i=0;i<x;i=i+4) {
71+
if (z != "") {
72+
ze = xpu_list[i+1];
73+
} else {
74+
ze = i;
75+
}
76+
printf(" --tx popen//env:ZE_AFFINITY_MASK=%d,%d,%d,%d//env:OMP_NUM_THREADS=%d//python=\"numactl -l -C %d-%d python\"",
77+
ze,ze+1,ze+2,ze+3,4*cx,i*cx,(i+4)*cx-1);
6878
}
69-
printf(" --tx popen//env:ZE_AFFINITY_MASK=%d//env:OMP_NUM_THREADS=%d//python=\"numactl -l -C %d-%d python\"",
70-
ze, cx, i*cx, (i+1)*cx-1);
79+
}else {
80+
printf(" -n 1 ");
7181
}
72-
}else {
73-
printf(" -n 1 ");
74-
}
75-
}')"
82+
}')"
83+
else
84+
pytest_extra_args="$(echo |awk -v x="${xpu_num}" -v z="${ZE_AFFINITY_MASK}" -v cx="${cpus_per_xpu}" '{
85+
if (x > 0) {
86+
split(z, xpu_list, ",");
87+
for (i=0;i<x;i++) {
88+
if (z != "") {
89+
ze = xpu_list[i+1];
90+
} else {
91+
ze = i;
92+
}
93+
printf(" --tx popen//env:ZE_AFFINITY_MASK=%d//env:OMP_NUM_THREADS=%d//python=\"numactl -l -C %d-%d python\"",
94+
ze, cx, i*cx, (i+1)*cx-1);
95+
}
96+
}else {
97+
printf(" -n 1 ");
98+
}
99+
}')"
100+
fi
76101
echo "xpu_num=${xpu_num}" |tee -a ${GITHUB_OUTPUT}
77102
echo "cpus_per_xpu=${cpus_per_xpu}" |tee -a ${GITHUB_OUTPUT}
78103
echo "pytest_extra_args=${pytest_extra_args}" |tee -a ${GITHUB_OUTPUT}

.github/workflows/_linux_ut.yml

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -39,7 +39,7 @@ env:
3939
jobs:
4040
runner:
4141
runs-on: ${{ inputs.runner }}
42-
name: get-runner
42+
name: get-runner
4343
outputs:
4444
runner_id: ${{ steps.runner-info.outputs.runner_id }}
4545
user_id: ${{ steps.runner-info.outputs.user_id }}
@@ -54,6 +54,8 @@ jobs:
5454
uses: actions/checkout@v4
5555
- name: Get runner
5656
id: runner-info
57+
with:
58+
ut_name: ${{ inputs.ut }}
5759
uses: ./.github/actions/get-runner
5860

5961
test-in-container:
@@ -105,7 +107,7 @@ jobs:
105107
runs-on: ${{ needs.runner.outputs.runner_id }}
106108
env:
107109
AGENT_TOOLSDIRECTORY: /tmp/xpu-tool
108-
PYTEST_ADDOPTS: -v --timeout 3600 --timeout_method=thread -n 1
110+
PYTEST_ADDOPTS: -v --timeout 3600 --timeout_method=thread --dist worksteal ${{ needs.runner.outputs.pytest_extra_args }}
109111
steps:
110112
- name: Checkout torch-xpu-ops
111113
uses: actions/checkout@v4

test/xpu/run_distributed.py

Lines changed: 0 additions & 37 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,3 @@
1-
import os
21
import subprocess
32
import sys
43

@@ -9,42 +8,6 @@
98
res2 = 0
109
fail_test = []
1110

12-
# Get the xelink group card affinity
13-
ret = os.system("xpu-smi topology -m 2>&1|tee topology.log")
14-
if ret == 0:
15-
gpu_dict = {}
16-
with open("topology.log") as file:
17-
lines = file.readlines()
18-
for line in lines:
19-
if "CPU Affinity" in line:
20-
continue
21-
line = line.strip()
22-
if line.startswith("GPU "):
23-
items = line.split(" ")
24-
items = [x for x in items if x]
25-
gpu_id = items[1]
26-
i = gpu_id.split("/")[0]
27-
affinity = ""
28-
for j, item in enumerate(items):
29-
if "SYS" not in item and ("XL" in item or "S" in item):
30-
if len(affinity) == 0:
31-
affinity = str(j - 2)
32-
else:
33-
affinity = affinity + "," + str(j - 2)
34-
gpu_dict[i] = affinity
35-
36-
max_affinity = ""
37-
for key, value in gpu_dict.items():
38-
if len(value) > len(max_affinity):
39-
max_affinity = value
40-
41-
os.environ["ZE_AFFINITY_MASK"] = str(max_affinity)
42-
print(str("ZE_AFFINITY_MASK=" + os.environ.get("ZE_AFFINITY_MASK")))
43-
44-
else:
45-
print("xpu-smi topology failed")
46-
sys.exit(255)
47-
4811

4912
# run python test
5013
def run(test_command):

0 commit comments

Comments
 (0)