
Commit 433c6c3

UPSTREAM: <carry>: kubelet/cm: fix bug where kubelet restarts from missing cpuset cgroup
With the None cpumanager policy, cgroup v2, and the systemd cgroup manager, the kubelet can get into a situation where it believes the cpuset cgroup was created (by libcontainer in the cgroupfs) while systemd has deleted it, since systemd was never asked to create it. This causes one unnecessary restart: the kubelet fails with `failed to initialize top level QOS containers: root container [kubepods] doesn't exist.` Only a single restart occurs, because the kubelet skips recreating the cgroup once it exists, but the restart is still unwanted and is fixed here.

This is an adapted version of kubernetes@c51195d, reworked to avoid a merge conflict; the carry can be dropped in the 4.19 rebase.

Signed-off-by: Peter Hunt <[email protected]>
1 parent 03a907c
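For readers who want to observe the failure mode described above on a live node, here is a small, standalone Go sketch (not part of this commit) that inspects the kubepods cgroup on a cgroup v2 host. The `/sys/fs/cgroup/kubepods.slice` path and the controller-file check are assumptions based on the systemd cgroup driver's defaults; adjust for your configuration.

```go
// Standalone diagnostic sketch, not kubelet code. On a cgroup v2 node it
// checks whether the kubepods cgroup directory still exists and whether the
// cpuset controller is enabled for it. The path below assumes the systemd
// cgroup driver's default "kubepods.slice" name.
package main

import (
	"fmt"
	"os"
	"path/filepath"
	"slices"
	"strings"
)

func main() {
	kubepods := "/sys/fs/cgroup/kubepods.slice" // assumption: systemd driver default

	data, err := os.ReadFile(filepath.Join(kubepods, "cgroup.controllers"))
	if err != nil {
		// On an affected node the directory can be gone entirely, which is what
		// surfaces as "root container [kubepods] doesn't exist" at kubelet startup.
		fmt.Println("kubepods cgroup not readable:", err)
		return
	}

	controllers := strings.Fields(string(data))
	fmt.Println("enabled controllers:", controllers)
	fmt.Println("cpuset available here:", slices.Contains(controllers, "cpuset"))
}
```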

4 files changed: 52 additions, 9 deletions

pkg/kubelet/cm/cgroup_manager_linux.go (3 additions, 0 deletions)

```diff
@@ -383,6 +383,9 @@ func (m *cgroupManagerImpl) toResources(resourceConfig *ResourceConfig) *libcont
     if resourceConfig.PidsLimit != nil {
         resources.PidsLimit = *resourceConfig.PidsLimit
     }
+    if !resourceConfig.CPUSet.IsEmpty() {
+        resources.CpusetCpus = resourceConfig.CPUSet.String()
+    }
 
     m.maybeSetHugetlb(resourceConfig, resources)
 
```
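The hunk above is small, so here is a minimal, self-contained sketch of the mapping it adds. The `resources` struct below stands in for libcontainer's `configs.Resources`; only `k8s.io/utils/cpuset` is real, everything else is illustrative.

```go
// Minimal sketch (not kubelet code) of what toResources now does: a non-empty
// CPUSet is rendered into the list-format string that a cpuset.cpus-style
// field expects, while an empty set leaves the field untouched.
package main

import (
	"fmt"

	"k8s.io/utils/cpuset"
)

// resources stands in for libcontainer's configs.Resources in this sketch.
type resources struct {
	CpusetCpus string
}

func applyCPUSet(res *resources, set cpuset.CPUSet) {
	// Mirror the guard added in toResources: only write the cpuset when one
	// was actually requested, so configs without a CPUSet are unchanged.
	if !set.IsEmpty() {
		res.CpusetCpus = set.String()
	}
}

func main() {
	res := &resources{}
	applyCPUSet(res, cpuset.New(0, 1, 2, 3))
	fmt.Println(res.CpusetCpus) // "0-3"

	empty := &resources{}
	applyCPUSet(empty, cpuset.New())
	fmt.Printf("%q\n", empty.CpusetCpus) // "" - field left alone
}
```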

pkg/kubelet/cm/container_manager_linux.go (5 additions, 0 deletions)

```diff
@@ -34,6 +34,7 @@ import (
     "github.com/opencontainers/runc/libcontainer/configs"
     "k8s.io/klog/v2"
     "k8s.io/mount-utils"
+    "k8s.io/utils/cpuset"
     utilpath "k8s.io/utils/path"
 
     libcontaineruserns "github.com/opencontainers/runc/libcontainer/userns"
@@ -131,6 +132,10 @@ type containerManagerImpl struct {
     topologyManager topologymanager.Manager
     // Interface for Dynamic Resource Allocation management.
     draManager dra.Manager
+    // The full set of CPUs on the node. This field is set lazily, and is used to make sure
+    // the `cpuset` cgroup hierarchy is created on cgroup v2 when cpumanager is using a
+    // None policy.
+    allCPUs cpuset.CPUSet
 }
 
 type features struct {
```
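To illustrate the lazy-fill behavior documented on the new `allCPUs` field, here is a simplified sketch of the same pattern. The `nodeCPUCache` type and `cpuSetForPolicy` function are illustrative names, not kubelet APIs; the real logic lives in `getCgroupConfig` in the next file.

```go
// Simplified, standalone sketch of the control flow added around allCPUs:
// only the None policy path fills in the cpuset, the discovered set is cached
// after the first successful lookup, and a failed discovery yields an empty
// set so nothing is written to the cgroup.
package main

import (
	"fmt"

	"k8s.io/utils/cpuset"
)

const policyNone = "none"

type nodeCPUCache struct {
	allCPUs cpuset.CPUSet
}

// cpuSetForPolicy mirrors the gate added to getCgroupConfig.
func (c *nodeCPUCache) cpuSetForPolicy(policy string, discover func() (cpuset.CPUSet, error)) cpuset.CPUSet {
	if policy != policyNone {
		// Static policy manages the cpuset itself; leave it unset here.
		return cpuset.CPUSet{}
	}
	if c.allCPUs.IsEmpty() {
		set, err := discover()
		if err != nil {
			// Match the kubelet's fallback: continue with an empty set, which
			// later means CpusetCpus is simply not written; discovery is
			// retried on the next call.
			fmt.Println("cpu discovery failed:", err)
			return cpuset.CPUSet{}
		}
		c.allCPUs = set
	}
	return c.allCPUs
}

func main() {
	cache := &nodeCPUCache{}
	discover := func() (cpuset.CPUSet, error) { return cpuset.New(0, 1, 2, 3, 4, 5, 6, 7), nil }

	fmt.Println(cache.cpuSetForPolicy("static", discover))  // empty: static policy handles cpusets
	fmt.Println(cache.cpuSetForPolicy(policyNone, discover)) // "0-7", discovered once and cached
}
```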

pkg/kubelet/cm/node_container_manager_linux.go (41 additions, 9 deletions)

```diff
@@ -31,9 +31,12 @@ import (
     utilfeature "k8s.io/apiserver/pkg/util/feature"
     "k8s.io/klog/v2"
     kubefeatures "k8s.io/kubernetes/pkg/features"
+    "k8s.io/kubernetes/pkg/kubelet/cm/cpumanager"
+    "k8s.io/kubernetes/pkg/kubelet/cm/cpumanager/topology"
     "k8s.io/kubernetes/pkg/kubelet/events"
     "k8s.io/kubernetes/pkg/kubelet/stats/pidlimit"
     kubetypes "k8s.io/kubernetes/pkg/kubelet/types"
+    "k8s.io/utils/cpuset"
 )
 
 const (
@@ -52,7 +55,7 @@ func (cm *containerManagerImpl) createNodeAllocatableCgroups() error {
     cgroupConfig := &CgroupConfig{
         Name: cm.cgroupRoot,
         // The default limits for cpu shares can be very low which can lead to CPU starvation for pods.
-        ResourceParameters: getCgroupConfig(nodeAllocatable),
+        ResourceParameters: cm.getCgroupConfig(nodeAllocatable),
     }
     if cm.cgroupManager.Exists(cgroupConfig.Name) {
         return nil
@@ -80,7 +83,7 @@ func (cm *containerManagerImpl) enforceNodeAllocatableCgroups() error {
 
     cgroupConfig := &CgroupConfig{
         Name: cm.cgroupRoot,
-        ResourceParameters: getCgroupConfig(nodeAllocatable),
+        ResourceParameters: cm.getCgroupConfig(nodeAllocatable),
     }
 
     // Using ObjectReference for events as the node maybe not cached; refer to #42701 for detail.
@@ -114,7 +117,7 @@ func (cm *containerManagerImpl) enforceNodeAllocatableCgroups() error {
     // Now apply kube reserved and system reserved limits if required.
     if nc.EnforceNodeAllocatable.Has(kubetypes.SystemReservedEnforcementKey) {
         klog.V(2).InfoS("Enforcing system reserved on cgroup", "cgroupName", nc.SystemReservedCgroupName, "limits", nc.SystemReserved)
-        if err := enforceExistingCgroup(cm.cgroupManager, cm.cgroupManager.CgroupName(nc.SystemReservedCgroupName), nc.SystemReserved); err != nil {
+        if err := cm.enforceExistingCgroup(nc.SystemReservedCgroupName, nc.SystemReserved); err != nil {
             message := fmt.Sprintf("Failed to enforce System Reserved Cgroup Limits on %q: %v", nc.SystemReservedCgroupName, err)
             cm.recorder.Event(nodeRef, v1.EventTypeWarning, events.FailedNodeAllocatableEnforcement, message)
             return fmt.Errorf(message)
@@ -123,7 +126,7 @@ func (cm *containerManagerImpl) enforceNodeAllocatableCgroups() error {
     }
     if nc.EnforceNodeAllocatable.Has(kubetypes.KubeReservedEnforcementKey) {
         klog.V(2).InfoS("Enforcing kube reserved on cgroup", "cgroupName", nc.KubeReservedCgroupName, "limits", nc.KubeReserved)
-        if err := enforceExistingCgroup(cm.cgroupManager, cm.cgroupManager.CgroupName(nc.KubeReservedCgroupName), nc.KubeReserved); err != nil {
+        if err := cm.enforceExistingCgroup(nc.KubeReservedCgroupName, nc.KubeReserved); err != nil {
             message := fmt.Sprintf("Failed to enforce Kube Reserved Cgroup Limits on %q: %v", nc.KubeReservedCgroupName, err)
             cm.recorder.Event(nodeRef, v1.EventTypeWarning, events.FailedNodeAllocatableEnforcement, message)
             return fmt.Errorf(message)
@@ -134,8 +137,9 @@ func (cm *containerManagerImpl) enforceNodeAllocatableCgroups() error {
 }
 
 // enforceExistingCgroup updates the limits `rl` on existing cgroup `cName` using `cgroupManager` interface.
-func enforceExistingCgroup(cgroupManager CgroupManager, cName CgroupName, rl v1.ResourceList) error {
-    rp := getCgroupConfig(rl)
+func (cm *containerManagerImpl) enforceExistingCgroup(cNameStr string, rl v1.ResourceList) error {
+    cName := cm.cgroupManager.CgroupName(cNameStr)
+    rp := cm.getCgroupConfig(rl)
     if rp == nil {
         return fmt.Errorf("%q cgroup is not configured properly", cName)
     }
@@ -156,17 +160,17 @@ func enforceExistingCgroup(cgroupManager CgroupManager, cName CgroupName, rl v1.
         ResourceParameters: rp,
     }
     klog.V(4).InfoS("Enforcing limits on cgroup", "cgroupName", cName, "cpuShares", cgroupConfig.ResourceParameters.CPUShares, "memory", cgroupConfig.ResourceParameters.Memory, "pidsLimit", cgroupConfig.ResourceParameters.PidsLimit)
-    if err := cgroupManager.Validate(cgroupConfig.Name); err != nil {
+    if err := cm.cgroupManager.Validate(cgroupConfig.Name); err != nil {
         return err
     }
-    if err := cgroupManager.Update(cgroupConfig); err != nil {
+    if err := cm.cgroupManager.Update(cgroupConfig); err != nil {
         return err
     }
     return nil
 }
 
 // getCgroupConfig returns a ResourceConfig object that can be used to create or update cgroups via CgroupManager interface.
-func getCgroupConfig(rl v1.ResourceList) *ResourceConfig {
+func (cm *containerManagerImpl) getCgroupConfig(rl v1.ResourceList) *ResourceConfig {
     // TODO(vishh): Set CPU Quota if necessary.
     if rl == nil {
         return nil
@@ -188,9 +192,37 @@ func getCgroupConfig(rl v1.ResourceList) *ResourceConfig {
     }
     rc.HugePageLimit = HugePageLimits(rl)
 
+    // In the case of a None policy, cgroupv2 and systemd cgroup manager, we must make sure systemd is aware of the cpuset cgroup.
+    // By default, systemd will not create it, as we've not chosen to delegate it, and we haven't included it in the Apply() request.
+    // However, this causes a bug where kubelet restarts unnecessarily (cpuset cgroup is created in the cgroupfs, but systemd
+    // doesn't know about it and deletes it, and then kubelet doesn't continue because the cgroup isn't configured as expected).
+    // An alternative is to delegate the `cpuset` cgroup to the kubelet, but that would require some plumbing in libcontainer,
+    // and this is sufficient.
+    // Only do so on None policy, as Static policy will do its own updating of the cpuset.
+    if cm.NodeConfig.CPUManagerPolicy == string(cpumanager.PolicyNone) {
+        if cm.allCPUs.IsEmpty() {
+            cm.allCPUs = cm.getAllCPUs()
+        }
+        rc.CPUSet = cm.allCPUs
+    }
+
     return &rc
 }
 
+func (cm *containerManagerImpl) getAllCPUs() cpuset.CPUSet {
+    machineInfo, err := cm.cadvisorInterface.MachineInfo()
+    if err != nil {
+        klog.V(4).InfoS("Failed to get machine info to get default cpuset", "error", err)
+        return cpuset.CPUSet{}
+    }
+    topo, err := topology.Discover(machineInfo)
+    if err != nil {
+        klog.V(4).InfoS("Failed to get topology info to get default cpuset", "error", err)
+        return cpuset.CPUSet{}
+    }
+    return topo.CPUDetails.CPUs()
+}
+
 // GetNodeAllocatableAbsolute returns the absolute value of Node Allocatable which is primarily useful for enforcement.
 // Note that not all resources that are available on the node are included in the returned list of resources.
 // Returns a ResourceList.
```
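`getAllCPUs` above leans on cAdvisor's `MachineInfo` and the cpumanager's `topology.Discover`, which are hard to reproduce outside the kubelet. The sketch below uses an illustrative `cpuInfo` struct instead of cAdvisor's types to show the end result: flattening discovered topology into one node-wide `cpuset.CPUSet`.

```go
// Simplified stand-in (the cpuInfo struct is illustrative, not cadvisor's
// MachineInfo) for what getAllCPUs ultimately produces: the union of every
// logical CPU ID reported by topology discovery, as a single cpuset.CPUSet.
package main

import (
	"fmt"

	"k8s.io/utils/cpuset"
)

// cpuInfo mimics the shape of per-socket topology data: each socket lists the
// logical CPU IDs of its threads.
type cpuInfo struct {
	socketID int
	threads  []int
}

func allCPUsFromTopology(sockets []cpuInfo) cpuset.CPUSet {
	all := cpuset.New()
	for _, s := range sockets {
		// Union the threads of each socket into the node-wide set, the same
		// way topo.CPUDetails.CPUs() flattens the discovered topology.
		all = all.Union(cpuset.New(s.threads...))
	}
	return all
}

func main() {
	sockets := []cpuInfo{
		{socketID: 0, threads: []int{0, 1, 2, 3}},
		{socketID: 1, threads: []int{4, 5, 6, 7}},
	}
	fmt.Println(allCPUsFromTopology(sockets)) // "0-7"
}
```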

pkg/kubelet/cm/types.go (3 additions, 0 deletions)

```diff
@@ -19,12 +19,15 @@ package cm
 import (
     v1 "k8s.io/api/core/v1"
     "k8s.io/apimachinery/pkg/types"
+    "k8s.io/utils/cpuset"
 )
 
 // ResourceConfig holds information about all the supported cgroup resource parameters.
 type ResourceConfig struct {
     // Memory limit (in bytes).
     Memory *int64
+    // CPU set (number of cpus the cgroup has access to).
+    CPUSet cpuset.CPUSet
     // CPU shares (relative weight vs. other containers).
     CPUShares *uint64
     // CPU hardcap limit (in usecs). Allowed cpu time in a given period.
```
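Since the new `ResourceConfig.CPUSet` field is just a `cpuset.CPUSet` value, a short tour of that type (from `k8s.io/utils/cpuset`) shows what the rest of the patch relies on; this is a hedged usage sketch, not code from the commit.

```go
// Quick tour of the cpuset.CPUSet value type carried by ResourceConfig.CPUSet.
package main

import (
	"fmt"

	"k8s.io/utils/cpuset"
)

func main() {
	// Build a set explicitly, e.g. what topology discovery might return on a 4-CPU node.
	all := cpuset.New(0, 1, 2, 3)
	fmt.Println(all.String(), all.Size()) // "0-3" 4

	// Parse the kernel's list format, the same format written to cpuset.cpus.
	parsed, err := cpuset.Parse("0-2,5")
	if err != nil {
		panic(err)
	}
	fmt.Println(parsed.List()) // [0 1 2 5]

	// The zero value is empty, which is what getCgroupConfig relies on to mean
	// "no cpuset requested" and what toResources checks before writing CpusetCpus.
	var unset cpuset.CPUSet
	fmt.Println(unset.IsEmpty()) // true
}
```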
