Commit c51195d

kubelet/cm: fix bug where kubelet restarts from missing cpuset cgroup
With the None cpumanager policy, cgroup v2, and the systemd cgroup manager, the kubelet could get into a situation where it believes the cpuset cgroup was created (by libcontainer in the cgroupfs), but systemd has deleted it, since systemd was never asked to create it. This causes one unnecessary restart: the kubelet fails with `failed to initialize top level QOS containers: root container [kubepods] doesn't exist.` It causes only one restart because the kubelet skips recreating the cgroup once it exists, but it is still a nuisance and is fixed here.

Signed-off-by: Peter Hunt <[email protected]>
1 parent 1b6c993 commit c51195d

File tree: 4 files changed (+54 -12 lines changed)


pkg/kubelet/cm/cgroup_manager_linux.go

Lines changed: 3 additions & 0 deletions
@@ -298,6 +298,9 @@ func (m *cgroupCommon) toResources(resourceConfig *ResourceConfig) *libcontainer
 	if resourceConfig.PidsLimit != nil {
 		resources.PidsLimit = *resourceConfig.PidsLimit
 	}
+	if !resourceConfig.CPUSet.IsEmpty() {
+		resources.CpusetCpus = resourceConfig.CPUSet.String()
+	}

 	m.maybeSetHugetlb(resourceConfig, resources)
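
As a quick illustration of what the new branch produces, here is a minimal, self-contained sketch (not kubelet code). It assumes only `k8s.io/utils/cpuset`; the printed string is the kind of value the patched toResources copies into libcontainer's CpusetCpus when ResourceConfig.CPUSet is non-empty, and the CPU IDs are made up:

package main

import (
	"fmt"

	"k8s.io/utils/cpuset"
)

func main() {
	// Hypothetical 8-CPU node; cpuset.New builds an immutable CPU set.
	allCPUs := cpuset.New(0, 1, 2, 3, 4, 5, 6, 7)

	// When ResourceConfig.CPUSet is non-empty, the patched toResources copies
	// its canonical string form into libcontainer's Resources.CpusetCpus.
	if !allCPUs.IsEmpty() {
		fmt.Println(allCPUs.String()) // prints "0-7"
	}
}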

pkg/kubelet/cm/container_manager_linux.go

Lines changed: 5 additions & 0 deletions
@@ -32,6 +32,7 @@ import (
 	"github.com/opencontainers/runc/libcontainer/configs"
 	"k8s.io/klog/v2"
 	"k8s.io/mount-utils"
+	"k8s.io/utils/cpuset"
 	utilpath "k8s.io/utils/path"

 	v1 "k8s.io/api/core/v1"
@@ -132,6 +133,10 @@ type containerManagerImpl struct {
 	topologyManager topologymanager.Manager
 	// Interface for Dynamic Resource Allocation management.
 	draManager dra.Manager
+	// The full set of CPUs on the node. This field is set lazily, and is used to make sure
+	// the `cpuset` cgroup hierarchy is created on cgroup v2 when cpumanager is using a
+	// None policy.
+	allCPUs cpuset.CPUSet
 }

 type features struct {
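
The new allCPUs field is just a lazily filled cache. A rough sketch of that pattern, using a stand-in struct rather than the real containerManagerImpl (the discover callback here is hypothetical):

package main

import (
	"fmt"

	"k8s.io/utils/cpuset"
)

// manager stands in for containerManagerImpl; only the caching behavior is shown.
type manager struct {
	allCPUs cpuset.CPUSet
}

// allNodeCPUs discovers the node's CPUs on first use and reuses the cached value
// afterwards, mirroring how allCPUs is filled lazily in getCgroupConfig.
func (m *manager) allNodeCPUs(discover func() cpuset.CPUSet) cpuset.CPUSet {
	if m.allCPUs.IsEmpty() {
		m.allCPUs = discover()
	}
	return m.allCPUs
}

func main() {
	m := &manager{}
	discover := func() cpuset.CPUSet { return cpuset.New(0, 1, 2, 3) } // fake discovery
	fmt.Println(m.allNodeCPUs(discover).String()) // "0-3"
	fmt.Println(m.allNodeCPUs(discover).String()) // cached, still "0-3"
}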

pkg/kubelet/cm/node_container_manager_linux.go

Lines changed: 43 additions & 12 deletions
@@ -32,9 +32,12 @@ import (
 	utilfeature "k8s.io/apiserver/pkg/util/feature"
 	"k8s.io/klog/v2"
 	kubefeatures "k8s.io/kubernetes/pkg/features"
+	"k8s.io/kubernetes/pkg/kubelet/cm/cpumanager"
+	"k8s.io/kubernetes/pkg/kubelet/cm/cpumanager/topology"
 	"k8s.io/kubernetes/pkg/kubelet/events"
 	"k8s.io/kubernetes/pkg/kubelet/stats/pidlimit"
 	kubetypes "k8s.io/kubernetes/pkg/kubelet/types"
+	"k8s.io/utils/cpuset"
 )

 const (
@@ -53,7 +56,7 @@ func (cm *containerManagerImpl) createNodeAllocatableCgroups() error {
 	cgroupConfig := &CgroupConfig{
 		Name: cm.cgroupRoot,
 		// The default limits for cpu shares can be very low which can lead to CPU starvation for pods.
-		ResourceParameters: getCgroupConfig(nodeAllocatable, false),
+		ResourceParameters: cm.getCgroupConfig(nodeAllocatable, false),
 	}
 	if cm.cgroupManager.Exists(cgroupConfig.Name) {
 		return nil
@@ -81,7 +84,7 @@ func (cm *containerManagerImpl) enforceNodeAllocatableCgroups() error {

 	cgroupConfig := &CgroupConfig{
 		Name: cm.cgroupRoot,
-		ResourceParameters: getCgroupConfig(nodeAllocatable, false),
+		ResourceParameters: cm.getCgroupConfig(nodeAllocatable, false),
 	}

 	// Using ObjectReference for events as the node maybe not cached; refer to #42701 for detail.
@@ -110,7 +113,7 @@ func (cm *containerManagerImpl) enforceNodeAllocatableCgroups() error {
 	// Now apply kube reserved and system reserved limits if required.
 	if nc.EnforceNodeAllocatable.Has(kubetypes.SystemReservedEnforcementKey) {
 		klog.V(2).InfoS("Enforcing system reserved on cgroup", "cgroupName", nc.SystemReservedCgroupName, "limits", nc.SystemReserved)
-		if err := enforceExistingCgroup(cm.cgroupManager, cm.cgroupManager.CgroupName(nc.SystemReservedCgroupName), nc.SystemReserved, false); err != nil {
+		if err := cm.enforceExistingCgroup(nc.SystemReservedCgroupName, nc.SystemReserved, false); err != nil {
 			message := fmt.Sprintf("Failed to enforce System Reserved Cgroup Limits on %q: %v", nc.SystemReservedCgroupName, err)
 			cm.recorder.Event(nodeRef, v1.EventTypeWarning, events.FailedNodeAllocatableEnforcement, message)
 			return errors.New(message)
@@ -119,7 +122,7 @@ func (cm *containerManagerImpl) enforceNodeAllocatableCgroups() error {
 	}
 	if nc.EnforceNodeAllocatable.Has(kubetypes.KubeReservedEnforcementKey) {
 		klog.V(2).InfoS("Enforcing kube reserved on cgroup", "cgroupName", nc.KubeReservedCgroupName, "limits", nc.KubeReserved)
-		if err := enforceExistingCgroup(cm.cgroupManager, cm.cgroupManager.CgroupName(nc.KubeReservedCgroupName), nc.KubeReserved, false); err != nil {
+		if err := cm.enforceExistingCgroup(nc.KubeReservedCgroupName, nc.KubeReserved, false); err != nil {
 			message := fmt.Sprintf("Failed to enforce Kube Reserved Cgroup Limits on %q: %v", nc.KubeReservedCgroupName, err)
 			cm.recorder.Event(nodeRef, v1.EventTypeWarning, events.FailedNodeAllocatableEnforcement, message)
 			return errors.New(message)
@@ -129,7 +132,7 @@ func (cm *containerManagerImpl) enforceNodeAllocatableCgroups() error {

 	if nc.EnforceNodeAllocatable.Has(kubetypes.SystemReservedCompressibleEnforcementKey) {
 		klog.V(2).InfoS("Enforcing system reserved compressible on cgroup", "cgroupName", nc.SystemReservedCgroupName, "limits", nc.SystemReserved)
-		if err := enforceExistingCgroup(cm.cgroupManager, cm.cgroupManager.CgroupName(nc.SystemReservedCgroupName), nc.SystemReserved, true); err != nil {
+		if err := cm.enforceExistingCgroup(nc.SystemReservedCgroupName, nc.SystemReserved, true); err != nil {
 			message := fmt.Sprintf("Failed to enforce System Reserved Compressible Cgroup Limits on %q: %v", nc.SystemReservedCgroupName, err)
 			cm.recorder.Event(nodeRef, v1.EventTypeWarning, events.FailedNodeAllocatableEnforcement, message)
 			return errors.New(message)
@@ -139,7 +142,7 @@ func (cm *containerManagerImpl) enforceNodeAllocatableCgroups() error {

 	if nc.EnforceNodeAllocatable.Has(kubetypes.KubeReservedCompressibleEnforcementKey) {
 		klog.V(2).InfoS("Enforcing kube reserved compressible on cgroup", "cgroupName", nc.KubeReservedCgroupName, "limits", nc.KubeReserved)
-		if err := enforceExistingCgroup(cm.cgroupManager, cm.cgroupManager.CgroupName(nc.KubeReservedCgroupName), nc.KubeReserved, true); err != nil {
+		if err := cm.enforceExistingCgroup(nc.KubeReservedCgroupName, nc.KubeReserved, true); err != nil {
 			message := fmt.Sprintf("Failed to enforce Kube Reserved Compressible Cgroup Limits on %q: %v", nc.KubeReservedCgroupName, err)
 			cm.recorder.Event(nodeRef, v1.EventTypeWarning, events.FailedNodeAllocatableEnforcement, message)
 			return errors.New(message)
@@ -150,9 +153,9 @@ func (cm *containerManagerImpl) enforceNodeAllocatableCgroups() error {
 	}

 // enforceExistingCgroup updates the limits `rl` on existing cgroup `cName` using `cgroupManager` interface.
-func enforceExistingCgroup(cgroupManager CgroupManager, cName CgroupName, rl v1.ResourceList, compressibleResources bool) error {
-	rp := getCgroupConfig(rl, compressibleResources)
-
+func (cm *containerManagerImpl) enforceExistingCgroup(cNameStr string, rl v1.ResourceList, compressibleResources bool) error {
+	cName := cm.cgroupManager.CgroupName(cNameStr)
+	rp := cm.getCgroupConfig(rl, compressibleResources)
 	if rp == nil {
 		return fmt.Errorf("%q cgroup is not configured properly", cName)
 	}
@@ -173,17 +176,17 @@ func enforceExistingCgroup(cgroupManager CgroupManager, cName CgroupName, rl v1.
 		ResourceParameters: rp,
 	}
 	klog.V(4).InfoS("Enforcing limits on cgroup", "cgroupName", cName, "cpuShares", cgroupConfig.ResourceParameters.CPUShares, "memory", cgroupConfig.ResourceParameters.Memory, "pidsLimit", cgroupConfig.ResourceParameters.PidsLimit)
-	if err := cgroupManager.Validate(cgroupConfig.Name); err != nil {
+	if err := cm.cgroupManager.Validate(cgroupConfig.Name); err != nil {
 		return err
 	}
-	if err := cgroupManager.Update(cgroupConfig); err != nil {
+	if err := cm.cgroupManager.Update(cgroupConfig); err != nil {
 		return err
 	}
 	return nil
 }

 // getCgroupConfig returns a ResourceConfig object that can be used to create or update cgroups via CgroupManager interface.
-func getCgroupConfig(rl v1.ResourceList, compressibleResourcesOnly bool) *ResourceConfig {
+func (cm *containerManagerImpl) getCgroupConfig(rl v1.ResourceList, compressibleResourcesOnly bool) *ResourceConfig {
 	// TODO(vishh): Set CPU Quota if necessary.
 	if rl == nil {
 		return nil
@@ -217,9 +220,37 @@ func getCgroupConfig(rl v1.ResourceList, compressibleResourcesOnly bool) *Resour
 		rc.HugePageLimit = HugePageLimits(rl)
 	}

+	// In the case of a None policy, cgroupv2 and systemd cgroup manager, we must make sure systemd is aware of the cpuset cgroup.
+	// By default, systemd will not create it, as we've not chosen to delegate it, and we haven't included it in the Apply() request.
+	// However, this causes a bug where kubelet restarts unnecessarily (cpuset cgroup is created in the cgroupfs, but systemd
+	// doesn't know about it and deletes it, and then kubelet doesn't continue because the cgroup isn't configured as expected).
+	// An alternative is to delegate the `cpuset` cgroup to the kubelet, but that would require some plumbing in libcontainer,
+	// and this is sufficient.
+	// Only do so on None policy, as Static policy will do its own updating of the cpuset.
+	if cm.NodeConfig.CPUManagerPolicy == string(cpumanager.PolicyNone) {
+		if cm.allCPUs.IsEmpty() {
+			cm.allCPUs = cm.getAllCPUs()
+		}
+		rc.CPUSet = cm.allCPUs
+	}
+
 	return &rc
 }

+func (cm *containerManagerImpl) getAllCPUs() cpuset.CPUSet {
+	machineInfo, err := cm.cadvisorInterface.MachineInfo()
+	if err != nil {
+		klog.V(4).InfoS("Failed to get machine info to get default cpuset", "error", err)
+		return cpuset.CPUSet{}
+	}
+	topo, err := topology.Discover(machineInfo)
+	if err != nil {
+		klog.V(4).InfoS("Failed to get topology info to get default cpuset", "error", err)
+		return cpuset.CPUSet{}
+	}
+	return topo.CPUDetails.CPUs()
+}
+
 // GetNodeAllocatableAbsolute returns the absolute value of Node Allocatable which is primarily useful for enforcement.
 // Note that not all resources that are available on the node are included in the returned list of resources.
 // Returns a ResourceList.
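
To summarize the new logic in getCgroupConfig and getAllCPUs, here is a simplified, self-contained sketch (not the kubelet's code): only the "none" policy pins the full CPU set, and any discovery failure degrades to an empty set, which toResources then ignores. The helper and callback names are invented for the example:

package main

import (
	"fmt"

	"k8s.io/utils/cpuset"
)

// cpuSetForNodeAllocatable is an invented helper that condenses the new branch:
// only the "none" CPU manager policy pins the node-allocatable cgroup to the full
// CPU set, and a failed discovery degrades to an empty set.
func cpuSetForNodeAllocatable(policy string, discoverAll func() (cpuset.CPUSet, error)) cpuset.CPUSet {
	if policy != "none" {
		// The static policy updates cpuset.cpus itself, so nothing is pinned here.
		return cpuset.CPUSet{}
	}
	all, err := discoverAll()
	if err != nil {
		// Empty set: toResources will simply leave CpusetCpus unset.
		return cpuset.CPUSet{}
	}
	return all
}

func main() {
	discover := func() (cpuset.CPUSet, error) { return cpuset.New(0, 1, 2, 3), nil } // fake topology
	fmt.Println(cpuSetForNodeAllocatable("none", discover).String())    // "0-3"
	fmt.Println(cpuSetForNodeAllocatable("static", discover).IsEmpty()) // true
}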

pkg/kubelet/cm/types.go

Lines changed: 3 additions & 0 deletions
@@ -19,12 +19,15 @@ package cm
 import (
 	v1 "k8s.io/api/core/v1"
 	"k8s.io/apimachinery/pkg/types"
+	"k8s.io/utils/cpuset"
 )

 // ResourceConfig holds information about all the supported cgroup resource parameters.
 type ResourceConfig struct {
 	// Memory limit (in bytes).
 	Memory *int64
+	// CPU set (number of CPUs the cgroup has access to).
+	CPUSet cpuset.CPUSet
 	// CPU shares (relative weight vs. other containers).
 	CPUShares *uint64
 	// CPU hardcap limit (in usecs). Allowed cpu time in a given period.
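
For context, a small standalone example of populating the new field alongside its neighbors; the struct below merely mirrors the three ResourceConfig fields visible in this hunk and is not the type from pkg/kubelet/cm:

package main

import (
	"fmt"

	"k8s.io/utils/cpuset"
)

// resourceConfig mirrors the three ResourceConfig fields visible in this hunk.
type resourceConfig struct {
	Memory    *int64        // memory limit in bytes
	CPUSet    cpuset.CPUSet // CPUs the cgroup has access to
	CPUShares *uint64       // relative CPU weight
}

func main() {
	mem := int64(2 << 30)  // 2 GiB, illustrative
	shares := uint64(1024) // illustrative weight
	rc := resourceConfig{
		Memory:    &mem,
		CPUShares: &shares,
		CPUSet:    cpuset.New(0, 1, 2, 3),
	}
	fmt.Println(rc.CPUSet.String()) // "0-3"
}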
