Skip to content

Commit 8bf029f

Browse files
authored
Merge pull request #18 from CLIP-HPC/16-gpu-heterogeneity-issues
Fix the heterogeneous GPU issue
2 parents cab3e31 + 22e5714 commit 8bf029f

File tree

5 files changed

+147
-15
lines changed

5 files changed

+147
-15
lines changed

internal/model/tabs/clustertab/clustertab.go

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -4,11 +4,11 @@ import (
44
"log"
55
"strings"
66

7-
"github.com/charmbracelet/bubbles/progress"
8-
"github.com/charmbracelet/bubbles/textinput"
97
"github.com/CLIP-HPC/SlurmCommander/internal/generic"
108
"github.com/CLIP-HPC/SlurmCommander/internal/slurm"
119
"github.com/CLIP-HPC/SlurmCommander/internal/table"
10+
"github.com/charmbracelet/bubbles/progress"
11+
"github.com/charmbracelet/bubbles/textinput"
1212
)
1313

1414
type ClusterTab struct {
@@ -89,6 +89,7 @@ func (t *ClusterTab) GetStatsFiltered(l *log.Logger) {
8989
mpp[p].Name = p
9090
mpp[p].Count += uint(*v.AllocMemory)
9191
mpp[p].Total += uint(*v.RealMemory)
92+
9293
gpp[p].Name = p
9394
gpp[p].Count += uint(*slurm.ParseGRES(*v.GresUsed))
9495
gpp[p].Total += uint(*slurm.ParseGRES(*v.Gres))

internal/model/tabs/clustertab/clustertabview.go

Lines changed: 27 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@ package clustertab
33
import (
44
"fmt"
55
"log"
6+
"sort"
67
"strings"
78

89
"github.com/CLIP-HPC/SlurmCommander/internal/generic"
@@ -29,9 +30,12 @@ func (ct *ClusterTab) tabClusterBars(l *log.Logger) string {
2930
memPerc float64 = 0
3031
memUsed int64 = 0
3132
memAvail int = 0
32-
gpuPerc float64 = 0
33-
gpuUsed int = 0
34-
gpuAvail int = 0
33+
//gpuPerc float64 = 0
34+
gpuUsed slurm.GresMap = make(slurm.GresMap)
35+
gpuAvail slurm.GresMap = make(slurm.GresMap)
36+
gpuPerc map[string]float64 = make(map[string]float64)
37+
gpuList string
38+
gpuSlice []string = make([]string, 0)
3539
)
3640

3741
sel := ct.SinfoTable.Cursor()
@@ -47,18 +51,31 @@ func (ct *ClusterTab) tabClusterBars(l *log.Logger) string {
4751
memUsed = *ct.SinfoFiltered.Nodes[sel].AllocMemory
4852
memAvail = *ct.SinfoFiltered.Nodes[sel].RealMemory
4953
memPerc = float64(memUsed) / float64(memAvail)
50-
gpuAvail = *slurm.ParseGRES(*ct.SinfoFiltered.Nodes[sel].Gres)
51-
gpuUsed = *slurm.ParseGRES(*ct.SinfoFiltered.Nodes[sel].GresUsed)
52-
if gpuAvail > 0 {
53-
gpuPerc = float64(gpuUsed) / float64(gpuAvail)
54+
55+
gpuAvail = *slurm.ParseGRESAll(*ct.SinfoFiltered.Nodes[sel].Gres)
56+
gpuUsed = *slurm.ParseGRESAll(*ct.SinfoFiltered.Nodes[sel].GresUsed)
57+
if len(gpuAvail) > 0 {
58+
for k, _ := range gpuAvail {
59+
gpuPerc[k] = float64(gpuUsed[k]) / float64(gpuAvail[k])
60+
}
5461
}
5562
}
5663
cpur := lipgloss.JoinVertical(lipgloss.Left, fmt.Sprintf("CPU used/total: %d/%d", cpuUsed, cpuAvail), ct.CpuBar.ViewAs(cpuPerc))
5764
memr := lipgloss.JoinVertical(lipgloss.Left, fmt.Sprintf("MEM used/total: %d/%d", memUsed, memAvail), ct.MemBar.ViewAs(memPerc))
5865
scr += lipgloss.JoinVertical(lipgloss.Top, cpur, memr)
59-
if gpuAvail > 0 {
60-
gpur := lipgloss.JoinVertical(lipgloss.Left, fmt.Sprintf("GPU used/total: %d/%d", gpuUsed, gpuAvail), ct.GpuBar.ViewAs(gpuPerc))
61-
scr = lipgloss.JoinHorizontal(lipgloss.Top, scr, fmt.Sprintf("%4s", ""), gpur)
66+
67+
for k := range gpuAvail {
68+
gpuSlice = append(gpuSlice, k)
69+
}
70+
sort.Strings(gpuSlice)
71+
72+
if len(gpuAvail) > 0 {
73+
for _, k := range gpuSlice {
74+
// TODO: this adds one additional newline at the top bringing gpus down... find the fix
75+
//gpuList = lipgloss.JoinVertical(lipgloss.Left, gpuList, fmt.Sprintf("GPU %s used/total: %d/%d", k, gpuUsed[k], gpuAvail[k]), ct.GpuBar.ViewAs(gpuPerc[k]))
76+
gpuList += fmt.Sprintf("GPU %q used/total: %d/%d\n", k, gpuUsed[k], gpuAvail[k]) + ct.GpuBar.ViewAs(gpuPerc[k]) + "\n"
77+
}
78+
scr = lipgloss.JoinHorizontal(lipgloss.Top, scr, fmt.Sprintf("%4s", ""), gpuList[:len(gpuList)-1])
6279
}
6380
scr += "\n\n"
6481
return scr

internal/model/view.go

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -46,14 +46,16 @@ func (m Model) tabAbout() string {
4646
s += "Commit : " + version.BuildCommit + "\n"
4747

4848
s += `
49-
Petar Jager
5049
51-
A special thank you goes to:
50+
A special thank you goes to our code-crafters, bug-hunters, idea-pitchers:
51+
(in order of appearance)
5252
53+
Petar Jager
5354
Seren Ümit
5455
Kilian Cavalotti
5556
Killian Murphy
5657
Hans-Nikolai Vießmann
58+
github.com/reedacus25
5759
`
5860

5961
return s

internal/slurm/sinfo.go

Lines changed: 23 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -25,9 +25,31 @@ func ParseGRES(line string) *int {
2525

2626
matches := gpuGresPattern.FindStringSubmatch(g)
2727
if len(matches) == 3 {
28-
value, _ = strconv.Atoi(matches[2])
28+
v, _ := strconv.Atoi(matches[2])
29+
value += v
2930
}
3031
}
3132

3233
return &value
3334
}
35+
36+
// GresMap holds integer counts keyed by string. ParseGRESAll fills it with the
// summed count of each "gpu:" GRES entry, keyed by the entry's optional middle
// field with its colons trimmed (presumably the GPU model name; an entry
// without that field ends up under the empty key "").
type GresMap map[string]int
37+
38+
func ParseGRESAll(line string) *GresMap {
39+
var gmap GresMap = make(GresMap)
40+
41+
gres := strings.Split(line, ",")
42+
for _, g := range gres {
43+
if !strings.HasPrefix(g, "gpu:") {
44+
continue
45+
}
46+
47+
matches := gpuGresPattern.FindStringSubmatch(g)
48+
if len(matches) == 3 {
49+
v, _ := strconv.Atoi(matches[2])
50+
gmap[strings.Trim(matches[1], ":")] += v
51+
}
52+
}
53+
54+
return &gmap
55+
}

internal/slurm/sinfo_test.go

Lines changed: 90 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,90 @@
1+
package slurm_test
2+
3+
import (
4+
"reflect"
5+
"testing"
6+
7+
"github.com/CLIP-HPC/SlurmCommander/internal/slurm"
8+
)
9+
10+
// gresTest is a table of GRES parsing cases. Each case carries a display
// name, the raw input string, the total expected from slurm.ParseGRES
// (expect) and the map expected from slurm.ParseGRESAll (expectMap).
type gresTest []struct {
11+
testName string
12+
input string
13+
expect int
14+
expectMap slurm.GresMap
15+
}
16+
17+
// gresTestTable lists the inputs shared by TestParseGRES and TestParseGRESAll:
// empty and junk strings, a gpu entry without a middle field, single-type
// entries, and multi-type GRES / GRES_USED strings. expect is the value
// compared against slurm.ParseGRES, expectMap the value compared against
// slurm.ParseGRESAll.
var (
18+
gresTestTable = gresTest{
19+
{
20+
testName: "GRES-empty",
21+
input: "",
22+
expect: 0,
23+
expectMap: slurm.GresMap{},
24+
},
25+
{
26+
testName: "GRES-junk: asdf123:123:123:123",
27+
input: "asdf123:123:123:123",
28+
expect: 0,
29+
expectMap: slurm.GresMap{},
30+
},
31+
{
32+
testName: "GRES-simple: gpu:8(S:0-1)",
33+
input: "gpu:8(S:0-1)",
34+
expect: 8,
35+
expectMap: slurm.GresMap{"": 8},
36+
},
37+
{
38+
testName: "GRES: gpu:P100:8(S:0-1)",
39+
input: "gpu:P100:8(S:0-1)",
40+
expect: 8,
41+
expectMap: slurm.GresMap{"P100": 8},
42+
},
43+
{
44+
testName: "GRES_USED: gpu:P100:2(IDX:3,7)",
45+
input: "gpu:P100:2(IDX:3,7)",
46+
expect: 2,
47+
expectMap: slurm.GresMap{"P100": 2},
48+
},
49+
{
50+
testName: "GRES: gpu:p100:6(S:0),gpu:rtx:2(S:0)",
51+
input: "gpu:p100:6(S:0),gpu:rtx:2(S:0)",
52+
expect: 8,
53+
expectMap: slurm.GresMap{"p100": 6, "rtx": 2},
54+
},
55+
{
56+
testName: "GRES_USED: gpu:p100:0(IDX:N/A),gpu:rtx:0(IDX:N/A)",
57+
input: "gpu:p100:0(IDX:N/A),gpu:rtx:0(IDX:N/A)",
58+
expect: 0,
59+
expectMap: slurm.GresMap{"p100": 0, "rtx": 0},
60+
},
61+
{
62+
testName: "GRES_USED: gpu:p100:2(IDX:0-1),gpu:rtx:1(IDX:7)",
63+
input: "gpu:p100:2(IDX:0-1),gpu:rtx:1(IDX:7)",
64+
expect: 3,
65+
expectMap: slurm.GresMap{"p100": 2, "rtx": 1},
66+
},
67+
}
68+
)
69+
70+
func TestParseGRES(t *testing.T) {
71+
for i, v := range gresTestTable {
72+
t.Logf("Running test %d : %q\n", i, v.testName)
73+
rez := *slurm.ParseGRES(v.input)
74+
t.Logf("Expect: %d Got: %d\n", v.expect, rez)
75+
if rez != v.expect {
76+
t.Fatal("FAILED !!!")
77+
}
78+
}
79+
}
80+
81+
func TestParseGRESAll(t *testing.T) {
82+
for i, v := range gresTestTable {
83+
t.Logf("Running test %d : %q\n", i, v.testName)
84+
rez := *slurm.ParseGRESAll(v.input)
85+
t.Logf("Expect: %#v Got: %#v\n", v.expectMap, rez)
86+
if !reflect.DeepEqual(rez, v.expectMap) {
87+
t.Fatal("FAILED !!!")
88+
}
89+
}
90+
}

0 commit comments

Comments
 (0)