Skip to content

Commit bd63535

Browse files
committed
detect topology directly for xdist
1 parent 1096b04 commit bd63535

File tree

5 files changed

+71
-37
lines changed

5 files changed

+71
-37
lines changed

.github/actions/get-runner/action.yml

Lines changed: 2 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@ name: Get Runner Infos
22

33
inputs:
44
ut_name:
5-
required: true
5+
required: false
66
type: string
77
description: Which ut to launch
88

@@ -64,22 +64,7 @@ runs:
6464
}' |wc -l)"
6565
cpus_per_xpu="$(echo |awk -v c="${cpu_num}" -v x="${xpu_num}" '{printf c/x}')"
6666
if [ "${{ inputs.ut_name }}" == "xpu_distributed" ];then
67-
pytest_extra_args="$(echo |awk -v x="${xpu_num}" -v z="${ZE_AFFINITY_MASK}" -v cx="${cpus_per_xpu}" '{
68-
if (x > 0) {
69-
split(z, xpu_list, ",");
70-
for (i=0;i<x;i=i+4) {
71-
if (z != "") {
72-
ze = xpu_list[i+1];
73-
} else {
74-
ze = i;
75-
}
76-
printf(" --tx popen//env:ZE_AFFINITY_MASK=%d,%d,%d,%d//env:OMP_NUM_THREADS=%d//python=\"numactl -l -C %d-%d python\"",
77-
ze,ze+1,ze+2,ze+3,4*cx,i*cx,(i+4)*cx-1);
78-
}
79-
}else {
80-
printf(" -n 1 ");
81-
}
82-
}')"
67+
pytest_extra_args="$(python ${{ github.workspace }}/.github/scripts/check-topology.py)"
8368
else
8469
pytest_extra_args="$(echo |awk -v x="${xpu_num}" -v z="${ZE_AFFINITY_MASK}" -v cx="${cpus_per_xpu}" '{
8570
if (x > 0) {

.github/scripts/check-topology.py

Lines changed: 64 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,64 @@
"""Emit pytest-xdist ``--tx`` specs, one per Xe Link connected GPU group.

The script runs ``xpu-smi topology -m``, groups the GPUs that are linked to
each other, and prints one ``--tx popen//...`` spec per group on stdout so a
caller can capture it with ``$(...)``.  Diagnostics go to stderr so they are
not swallowed by that capture.  The exit status is 255 when the topology
cannot be read or no usable group is found.
"""

import subprocess
import sys

# The original script left the raw tool output behind in this file; keep
# writing it in case a later step collects it.
TOPOLOGY_LOG = "topology.log"
FAILURE_EXIT_CODE = 255


def parse_topology(lines):
    """Return ``{gpu_group: cpu_range}`` parsed from topology matrix rows.

    Args:
        lines: Iterable of text lines from ``xpu-smi topology -m``.  Rows of
            interest start with ``GPU <id>/<tile>`` followed by one link cell
            per column and end with the CPU affinity list.

    Returns:
        A dict, in first-seen order, mapping a comma separated list of column
        indices (usable as ``ZE_AFFINITY_MASK``) to the first CPU range of the
        group.  A group is kept only when every member reports exactly the
        same set of linked columns, that set equals the member list, and all
        members share a single CPU range.
    """
    links_by_gpu = {}
    cpu_by_gpu = {}
    for raw in lines:
        # The header row also starts with "GPU " once stripped; skip it.
        if "CPU Affinity" in raw:
            continue
        row = raw.strip()
        if not row.startswith("GPU "):
            continue
        items = row.split()
        if len(items) < 3:
            # No link cells / affinity column on this row: nothing to record.
            continue
        gpu = items[1].split("/")[0]
        # Only the first range of the affinity list is used for pinning.
        cpu_by_gpu[gpu] = items[-1].split(",")[0]
        # Inspect link cells only (skip "GPU", the id and the CPU affinity).
        # "S" marks the GPU itself, "XL*" an Xe Link; "SYS" is excluded.
        links_by_gpu[gpu] = ",".join(
            str(col)
            for col, cell in enumerate(items[2:-1])
            if "SYS" not in cell and ("XL" in cell or "S" in cell)
        )

    members_by_links = {}
    for gpu, links in links_by_gpu.items():
        members_by_links.setdefault(links, []).append(gpu)

    groups = {}
    for links, members in members_by_links.items():
        # Keep only closed groups: the linked columns are exactly the members.
        if links != ",".join(members):
            continue
        cpu_ranges = {cpu_by_gpu[gpu] for gpu in members}
        if len(cpu_ranges) == 1:
            groups[links] = cpu_ranges.pop()
    return groups


def build_pytest_args(groups):
    """Return the concatenated ``--tx`` specs for *groups*.

    Args:
        groups: Mapping as returned by :func:`parse_topology`.

    Returns:
        A string with one `` --tx popen//...`` spec per group (each spec has a
        leading space), or an empty string when *groups* is empty.
    """
    specs = []
    for mask, cpu_range in groups.items():
        first, _, last = cpu_range.partition("-")
        # A single-CPU affinity has no "-"; treat it as a range of one.
        threads = int(last or first) - int(first) + 1
        specs.append(
            " --tx popen//env:ZE_AFFINITY_MASK=%s//env:OMP_NUM_THREADS=%d"
            '//python="numactl -l -C %s python"' % (mask, threads, cpu_range)
        )
    return "".join(specs)


def main():
    """Query the topology, print the pytest arguments and return the status."""
    try:
        # Run the tool directly rather than through "| tee": a shell pipeline
        # reports the status of its last command and hides xpu-smi failures.
        result = subprocess.run(
            ["xpu-smi", "topology", "-m"],
            stdout=subprocess.PIPE,
            stderr=subprocess.STDOUT,
            universal_newlines=True,
            check=False,
        )
    except OSError as err:
        print("xpu-smi topology failed", file=sys.stderr)
        print(err, file=sys.stderr)
        return FAILURE_EXIT_CODE

    with open(TOPOLOGY_LOG, "w") as log_file:
        log_file.write(result.stdout)

    if result.returncode != 0:
        print("xpu-smi topology failed", file=sys.stderr)
        print(result.stdout, file=sys.stderr)
        return FAILURE_EXIT_CODE

    groups = parse_topology(result.stdout.splitlines())
    if not groups:
        print("No Xelink detected", file=sys.stderr)
        return FAILURE_EXIT_CODE

    print(build_pytest_args(groups))
    return 0


if __name__ == "__main__":
    sys.exit(main())

.github/scripts/ut_result_check.sh

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -199,7 +199,8 @@ fi
199199

200200
if [[ "${ut_suite}" == 'xpu_distributed' ]]; then
201201
grep -E "^FAILED" xpu_distributed_test.log | awk '{print $3}' > ./"${ut_suite}"_xpu_distributed_test_failed.log
202-
sed -i '/^[^.]\+/d' ./"${ut_suite}"_xpu_distributed_test_failed.log
202+
grep -E "^FAILED" xpu_distributed_test.log | awk '{print $2}' >> ./"${ut_suite}"_xpu_distributed_test_failed.log
203+
sed -i '/^[^.d]\+/d' ./"${ut_suite}"_xpu_distributed_test_failed.log
203204
grep "PASSED" xpu_distributed_test.log | awk '{print $1}' > ./"${ut_suite}"_xpu_distributed_test_passed.log
204205
echo -e "========================================================================="
205206
echo -e "Show Failed cases in ${ut_suite} xpu distributed"
@@ -235,7 +236,8 @@ if [[ "${ut_suite}" == 'xpu_distributed' ]]; then
235236
num_failed_xpu_distributed=$(wc -l < "./${ut_suite}_xpu_distributed_test_failed.log")
236237
fi
237238
((num_failed=num_failed_xpu_distributed))
238-
if [[ $num_failed -gt 0 ]]; then
239+
num_passed=$(wc -l < "./${ut_suite}_xpu_distributed_test_passed.log")
240+
if [[ $num_failed -gt 0 ]] || [[ $num_passed -eq 0 ]]; then
239241
echo -e "[ERROR] UT ${ut_suite} test Fail"
240242
exit 1
241243
else

.github/workflows/_linux_ut.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -39,7 +39,7 @@ env:
3939
jobs:
4040
runner:
4141
runs-on: ${{ inputs.runner }}
42-
name: get-runner
42+
name: get-runner
4343
outputs:
4444
runner_id: ${{ steps.runner-info.outputs.runner_id }}
4545
user_id: ${{ steps.runner-info.outputs.user_id }}

test/xpu/skip_list_dist.py

Lines changed: 0 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -3,34 +3,20 @@
33
"../../../../test/distributed/fsdp/test_checkpoint_wrapper.py": None,
44
"../../../../test/distributed/fsdp/test_fsdp_backward_prefetch.py": None,
55
"../../../../test/distributed/fsdp/test_fsdp_apply.py": None,
6-
"../../../../test/distributed/fsdp/test_fsdp_clip_grad_norm.py": None,
76
"../../../../test/distributed/fsdp/test_fsdp_comm.py": None,
87
"../../../../test/distributed/fsdp/test_fsdp_comm_hooks.py": None,
98
"../../../../test/distributed/fsdp/test_fsdp_core.py": (
109
"test_transformer_no_grad_mixed_precision_True_xpu",
1110
),
12-
"../../../../test/distributed/fsdp/test_fsdp_exec_order.py": None,
1311
"../../../../test/distributed/fsdp/test_fsdp_dtensor_state_dict.py": None,
1412
"../../../../test/distributed/fsdp/test_fsdp_flatten_params.py": None,
1513
"../../../../test/distributed/fsdp/test_fsdp_freezing_weights.py": None,
16-
"../../../../test/distributed/fsdp/test_fsdp_hybrid_shard.py": None,
17-
"../../../../test/distributed/fsdp/test_fsdp_ignored_modules.py": None,
1814
"../../../../test/distributed/fsdp/test_fsdp_memory.py": None,
1915
"../../../../test/distributed/fsdp/test_fsdp_meta.py": None,
20-
"../../../../test/distributed/fsdp/test_fsdp_misc.py": None,
21-
"../../../../test/distributed/fsdp/test_fsdp_overlap.py": None,
2216
"../../../../test/distributed/fsdp/test_fsdp_pure_fp16.py": None,
2317
"../../../../test/distributed/fsdp/test_fsdp_sharded_grad_scaler.py": None,
2418
"../../../../test/distributed/fsdp/test_fsdp_state_dict.py": None,
25-
"../../../../test/distributed/fsdp/test_fsdp_tp_integration.py": None,
26-
"../../../../test/distributed/fsdp/test_fsdp_traversal.py": None,
27-
"../../../../test/distributed/fsdp/test_fsdp_uneven.py": None,
2819
"../../../../test/distributed/fsdp/test_fsdp_unshard_params.py": None,
29-
"../../../../test/distributed/fsdp/test_fsdp_use_orig_params.py": (
30-
"test_diff_hyperparams_sharding_strategy_str_full_shard",
31-
"test_diff_hyperparams_sharding_strategy_str_shard_grad_op",
32-
),
33-
"../../../../test/distributed/fsdp/test_hsdp_dtensor_state_dict.py": None,
3420
"../../../../test/distributed/fsdp/test_shard_utils.py": None,
3521
"../../../../test/distributed/fsdp/test_utils.py": None,
3622
"../../../../test/distributed/fsdp/test_wrap.py": None,
@@ -67,9 +53,6 @@
6753
"test_train_parity_shard_placement_fn_shard_largest_dim",
6854
"test_3d_mlp_with_nd_mesh",
6955
),
70-
"../../../../test/distributed/_composable/test_composability/test_2d_composability.py": (
71-
"test_tp_with_fsdp_offloading",
72-
),
7356
"../../../../test/distributed/_composable/test_replicate_with_compiler.py": None,
7457
"../../../../test/distributed/_composable/test_composability/test_pp_composability.py": None,
7558
"../../../../test/distributed/_composable/test_checkpoint.py": None,

0 commit comments

Comments
 (0)