Skip to content

Commit adc3ed2

Browse files
haircommander authored and atiratree committed
UPSTREAM: <carry>: disable load balancing on created cgroups when managed is enabled
Previously, cpu load balancing was enabled in cri-o by manually changing the sched_domain of cpus in sysfs. However, RHEL 9 dropped support for this knob, instead requiring it be changed in cgroups directly. To enable cpu load balancing on cgroupv1, the specified cgroup must have cpuset.sched_load_balance set to 0, as well as all of that cgroup's parents, plus all of the cgroups that contain a subset of the cpus that load balancing is disabled for. By default, all cpusets inherit the set from their parent and sched_load_balance as 1. Since we need to keep the cpus that need load balancing disabled in the root cgroup, all slices will inherit the full cpuset. Rather than rebalancing every cgroup whenever a new guaranteed cpuset cgroup is created, the approach this PR takes is to set load balancing to disabled for all slices. Since slices definitionally don't have any processes in them, setting load balancing won't affect the actual scheduling decisions of the kernel. All it will do is open the opportunity for CRI-O to set the actually set load balancing to disabled for containers that request it. Signed-off-by: Peter Hunt <[email protected]> UPSTREAM: <carry>: kubelet/cm: disable cpu load balancing on slices when using static cpu manager policy There are situations where cpu load balance disabling is desired when the kubelet is not in managed state. Instead of using that condition, set the cpu load balancing parameter for new slices when the cpu policy is static Signed-off-by: Peter Hunt <[email protected]> UPSTREAM: <carry>: cm: reorder setting of sched_load_balance for sandbox slice If we call mgr.Apply() first, libcontainer's cpusetCopyIfNeeded() will copy the parent cpuset and set load balancing to 1 by default. This causes the kernel to set the cpus to not load balanced for a brief moment which causes churn. instead, create the cgroup and set load balance, then have Apply() copy the values into it. 
Signed-off-by: Peter Hunt <[email protected]> UPSTREAM: <carry>: kubelet/cm: use MkdirAll when creating cpuset to ignore file exists error Signed-off-by: Peter Hunt <[email protected]>
1 parent 678af6d commit adc3ed2

File tree

6 files changed

+40
-1
lines changed

6 files changed

+40
-1
lines changed

pkg/kubelet/cm/cgroup_manager_linux.go

Lines changed: 24 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,7 @@ import (
2626
"sync"
2727
"time"
2828

29+
"github.com/opencontainers/runc/libcontainer/cgroups"
2930
libcontainercgroups "github.com/opencontainers/runc/libcontainer/cgroups"
3031
"github.com/opencontainers/runc/libcontainer/cgroups/fscommon"
3132
"github.com/opencontainers/runc/libcontainer/cgroups/manager"
@@ -147,6 +148,10 @@ type cgroupCommon struct {
147148

148149
// useSystemd tells if systemd cgroup manager should be used.
149150
useSystemd bool
151+
152+
// cpuLoadBalanceDisable tells whether kubelet should disable
153+
// cpu load balancing on new cgroups it creates.
154+
cpuLoadBalanceDisable bool
150155
}
151156

152157
// Make sure that cgroupV1impl and cgroupV2impl implement the CgroupManager interface
@@ -379,6 +384,25 @@ func (m *cgroupCommon) Create(cgroupConfig *CgroupConfig) error {
379384
return err
380385
}
381386

387+
// Disable cpuset.sched_load_balance for all cgroups Kubelet creates.
388+
// This way, CRI can disable sched_load_balance for pods that must have load balance
389+
// disabled, but the slices can contain all cpus (as the guaranteed cpus are known dynamically).
390+
// Note: this should be done before Apply(-1) below, as Apply contains cpusetCopyIfNeeded(), which will
391+
// populate the cpuset with the parent's cpuset. However, it will be initialized to sched_load_balance=1
392+
// which will cause the kernel to move all cpusets out of their isolated sched_domain, causing unnecessary churn.
393+
if m.cpuLoadBalanceDisable && !libcontainercgroups.IsCgroup2UnifiedMode() {
394+
path := manager.Path("cpuset")
395+
if path == "" {
396+
return fmt.Errorf("Failed to find cpuset for newly created cgroup")
397+
}
398+
if err := os.MkdirAll(path, 0o755); err != nil {
399+
return fmt.Errorf("failed to create cpuset for newly created cgroup: %w", err)
400+
}
401+
if err := cgroups.WriteFile(path, "cpuset.sched_load_balance", "0"); err != nil {
402+
return err
403+
}
404+
}
405+
382406
// Apply(-1) is a hack to create the cgroup directories for each resource
383407
// subsystem. The function [cgroups.Manager.apply()] applies cgroup
384408
// configuration to the process with the specified pid.
@@ -394,7 +418,6 @@ func (m *cgroupCommon) Create(cgroupConfig *CgroupConfig) error {
394418
if err := manager.Set(libcontainerCgroupConfig.Resources); err != nil {
395419
utilruntime.HandleError(fmt.Errorf("cgroup manager.Set failed: %w", err))
396420
}
397-
398421
return nil
399422
}
400423

pkg/kubelet/cm/cgroup_manager_unsupported.go

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -93,6 +93,9 @@ func (m *unsupportedCgroupManager) SetCgroupConfig(name CgroupName, resource v1.
9393
return errNotSupported
9494
}
9595

96+
func (m *unsupportedCgroupManager) SetCPULoadBalanceDisable() {
97+
}
98+
9699
var RootCgroupName = CgroupName([]string{})
97100

98101
func NewCgroupName(base CgroupName, components ...string) CgroupName {

pkg/kubelet/cm/cgroup_v1_manager_linux.go

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -189,3 +189,7 @@ func (c *cgroupV1impl) setCgroupMemoryConfig(cgroupPath string, resourceConfig *
189189
func (c *cgroupV1impl) getCgroupMemoryConfig(cgroupPath string) (*ResourceConfig, error) {
190190
return readCgroupMemoryConfig(cgroupPath, cgroupv1MemLimitFile)
191191
}
192+
193+
func (m *cgroupV1impl) SetCPULoadBalanceDisable() {
194+
m.cpuLoadBalanceDisable = true
195+
}

pkg/kubelet/cm/cgroup_v2_manager_linux.go

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -220,3 +220,7 @@ func cpuSharesToCPUWeight(cpuShares uint64) uint64 {
220220
func cpuWeightToCPUShares(cpuWeight uint64) uint64 {
221221
return uint64((((cpuWeight - 1) * 262142) / 9999) + 2)
222222
}
223+
224+
func (m *cgroupV2impl) SetCPULoadBalanceDisable() {
225+
m.cpuLoadBalanceDisable = true
226+
}

pkg/kubelet/cm/container_manager_linux.go

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -247,6 +247,9 @@ func NewContainerManager(mountUtil mount.Interface, cadvisorInterface cadvisor.I
247247
cgroupRoot := ParseCgroupfsToCgroupName(nodeConfig.CgroupRoot)
248248
cgroupManager := NewCgroupManager(subsystems, nodeConfig.CgroupDriver)
249249
nodeConfig.CgroupVersion = cgroupManager.Version()
250+
if nodeConfig.CPUManagerPolicy == string(cpumanager.PolicyStatic) {
251+
cgroupManager.SetCPULoadBalanceDisable()
252+
}
250253
// Check if Cgroup-root actually exists on the node
251254
if nodeConfig.CgroupsPerQOS {
252255
// this does default to / when enabled, but this tests against regressions.

pkg/kubelet/cm/types.go

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -90,6 +90,8 @@ type CgroupManager interface {
9090
SetCgroupConfig(name CgroupName, resource v1.ResourceName, resourceConfig *ResourceConfig) error
9191
// Version of the cgroup implementation on the host
9292
Version() int
93+
// Toggle whether CPU load balancing should be disabled for new cgroups the kubelet creates
94+
SetCPULoadBalanceDisable()
9395
}
9496

9597
// QOSContainersInfo stores the names of containers per qos

0 commit comments

Comments
 (0)