@@ -32,9 +32,12 @@ import (
 	utilfeature "k8s.io/apiserver/pkg/util/feature"
 	"k8s.io/klog/v2"
 	kubefeatures "k8s.io/kubernetes/pkg/features"
+	"k8s.io/kubernetes/pkg/kubelet/cm/cpumanager"
+	"k8s.io/kubernetes/pkg/kubelet/cm/cpumanager/topology"
 	"k8s.io/kubernetes/pkg/kubelet/events"
 	"k8s.io/kubernetes/pkg/kubelet/stats/pidlimit"
 	kubetypes "k8s.io/kubernetes/pkg/kubelet/types"
+	"k8s.io/utils/cpuset"
 )

 const (
@@ -53,7 +56,7 @@ func (cm *containerManagerImpl) createNodeAllocatableCgroups() error {
 	cgroupConfig := &CgroupConfig{
 		Name: cm.cgroupRoot,
 		// The default limits for cpu shares can be very low which can lead to CPU starvation for pods.
-		ResourceParameters: getCgroupConfig(nodeAllocatable, false),
+		ResourceParameters: cm.getCgroupConfig(nodeAllocatable, false),
 	}
 	if cm.cgroupManager.Exists(cgroupConfig.Name) {
 		return nil
@@ -81,7 +84,7 @@ func (cm *containerManagerImpl) enforceNodeAllocatableCgroups() error {

 	cgroupConfig := &CgroupConfig{
 		Name:               cm.cgroupRoot,
-		ResourceParameters: getCgroupConfig(nodeAllocatable, false),
+		ResourceParameters: cm.getCgroupConfig(nodeAllocatable, false),
 	}

 	// Using ObjectReference for events as the node maybe not cached; refer to #42701 for detail.
@@ -110,7 +113,7 @@ func (cm *containerManagerImpl) enforceNodeAllocatableCgroups() error {
 	// Now apply kube reserved and system reserved limits if required.
 	if nc.EnforceNodeAllocatable.Has(kubetypes.SystemReservedEnforcementKey) {
 		klog.V(2).InfoS("Enforcing system reserved on cgroup", "cgroupName", nc.SystemReservedCgroupName, "limits", nc.SystemReserved)
-		if err := enforceExistingCgroup(cm.cgroupManager, cm.cgroupManager.CgroupName(nc.SystemReservedCgroupName), nc.SystemReserved, false); err != nil {
+		if err := cm.enforceExistingCgroup(nc.SystemReservedCgroupName, nc.SystemReserved, false); err != nil {
 			message := fmt.Sprintf("Failed to enforce System Reserved Cgroup Limits on %q: %v", nc.SystemReservedCgroupName, err)
 			cm.recorder.Event(nodeRef, v1.EventTypeWarning, events.FailedNodeAllocatableEnforcement, message)
 			return errors.New(message)
@@ -119,7 +122,7 @@ func (cm *containerManagerImpl) enforceNodeAllocatableCgroups() error {
 	}
 	if nc.EnforceNodeAllocatable.Has(kubetypes.KubeReservedEnforcementKey) {
 		klog.V(2).InfoS("Enforcing kube reserved on cgroup", "cgroupName", nc.KubeReservedCgroupName, "limits", nc.KubeReserved)
-		if err := enforceExistingCgroup(cm.cgroupManager, cm.cgroupManager.CgroupName(nc.KubeReservedCgroupName), nc.KubeReserved, false); err != nil {
+		if err := cm.enforceExistingCgroup(nc.KubeReservedCgroupName, nc.KubeReserved, false); err != nil {
 			message := fmt.Sprintf("Failed to enforce Kube Reserved Cgroup Limits on %q: %v", nc.KubeReservedCgroupName, err)
 			cm.recorder.Event(nodeRef, v1.EventTypeWarning, events.FailedNodeAllocatableEnforcement, message)
 			return errors.New(message)
@@ -129,7 +132,7 @@ func (cm *containerManagerImpl) enforceNodeAllocatableCgroups() error {

 	if nc.EnforceNodeAllocatable.Has(kubetypes.SystemReservedCompressibleEnforcementKey) {
 		klog.V(2).InfoS("Enforcing system reserved compressible on cgroup", "cgroupName", nc.SystemReservedCgroupName, "limits", nc.SystemReserved)
-		if err := enforceExistingCgroup(cm.cgroupManager, cm.cgroupManager.CgroupName(nc.SystemReservedCgroupName), nc.SystemReserved, true); err != nil {
+		if err := cm.enforceExistingCgroup(nc.SystemReservedCgroupName, nc.SystemReserved, true); err != nil {
 			message := fmt.Sprintf("Failed to enforce System Reserved Compressible Cgroup Limits on %q: %v", nc.SystemReservedCgroupName, err)
 			cm.recorder.Event(nodeRef, v1.EventTypeWarning, events.FailedNodeAllocatableEnforcement, message)
 			return errors.New(message)
@@ -139,7 +142,7 @@ func (cm *containerManagerImpl) enforceNodeAllocatableCgroups() error {

 	if nc.EnforceNodeAllocatable.Has(kubetypes.KubeReservedCompressibleEnforcementKey) {
 		klog.V(2).InfoS("Enforcing kube reserved compressible on cgroup", "cgroupName", nc.KubeReservedCgroupName, "limits", nc.KubeReserved)
-		if err := enforceExistingCgroup(cm.cgroupManager, cm.cgroupManager.CgroupName(nc.KubeReservedCgroupName), nc.KubeReserved, true); err != nil {
+		if err := cm.enforceExistingCgroup(nc.KubeReservedCgroupName, nc.KubeReserved, true); err != nil {
 			message := fmt.Sprintf("Failed to enforce Kube Reserved Compressible Cgroup Limits on %q: %v", nc.KubeReservedCgroupName, err)
 			cm.recorder.Event(nodeRef, v1.EventTypeWarning, events.FailedNodeAllocatableEnforcement, message)
 			return errors.New(message)
@@ -150,9 +153,9 @@ func (cm *containerManagerImpl) enforceNodeAllocatableCgroups() error {
 }

 // enforceExistingCgroup updates the limits `rl` on existing cgroup `cName` using `cgroupManager` interface.
-func enforceExistingCgroup(cgroupManager CgroupManager, cName CgroupName, rl v1.ResourceList, compressibleResources bool) error {
-	rp := getCgroupConfig(rl, compressibleResources)
-
+func (cm *containerManagerImpl) enforceExistingCgroup(cNameStr string, rl v1.ResourceList, compressibleResources bool) error {
+	cName := cm.cgroupManager.CgroupName(cNameStr)
+	rp := cm.getCgroupConfig(rl, compressibleResources)
 	if rp == nil {
 		return fmt.Errorf("%q cgroup is not configured properly", cName)
 	}
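
The hunk above folds the `cm.cgroupManager.CgroupName(...)` conversion that every call site used to perform into `enforceExistingCgroup` itself, so callers now pass the configured string (e.g. `nc.KubeReservedCgroupName`) directly. As a rough, runnable sketch of what that conversion amounts to (the `cgroupName` type and `parse` function below are illustrative stand-ins, not the kubelet's real `CgroupName` type or cgroup manager API):

```go
package main

import (
	"fmt"
	"strings"
)

// cgroupName and parse are illustrative stand-ins for the kubelet's
// CgroupName type and the cgroupManager.CgroupName(string) conversion
// that the refactored method now performs internally: a cgroupfs-style
// path is split into its hierarchy components.
type cgroupName []string

func parse(cgroupfsPath string) cgroupName {
	return cgroupName(strings.Split(strings.Trim(cgroupfsPath, "/"), "/"))
}

func main() {
	fmt.Println(parse("/kube.slice/kube-reserved")) // [kube.slice kube-reserved]
}
```
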
@@ -173,17 +176,17 @@ func enforceExistingCgroup(cgroupManager CgroupManager, cName CgroupName, rl v1.
 		ResourceParameters: rp,
 	}
 	klog.V(4).InfoS("Enforcing limits on cgroup", "cgroupName", cName, "cpuShares", cgroupConfig.ResourceParameters.CPUShares, "memory", cgroupConfig.ResourceParameters.Memory, "pidsLimit", cgroupConfig.ResourceParameters.PidsLimit)
-	if err := cgroupManager.Validate(cgroupConfig.Name); err != nil {
+	if err := cm.cgroupManager.Validate(cgroupConfig.Name); err != nil {
 		return err
 	}
-	if err := cgroupManager.Update(cgroupConfig); err != nil {
+	if err := cm.cgroupManager.Update(cgroupConfig); err != nil {
 		return err
 	}
 	return nil
 }

 // getCgroupConfig returns a ResourceConfig object that can be used to create or update cgroups via CgroupManager interface.
-func getCgroupConfig(rl v1.ResourceList, compressibleResourcesOnly bool) *ResourceConfig {
+func (cm *containerManagerImpl) getCgroupConfig(rl v1.ResourceList, compressibleResourcesOnly bool) *ResourceConfig {
 	// TODO(vishh): Set CPU Quota if necessary.
 	if rl == nil {
 		return nil
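
The body of `getCgroupConfig` between these two hunks is elided, but the `compressibleResourcesOnly` flag it threads through is what distinguishes the four enforcement blocks earlier in the diff: CPU is a compressible resource (exceeding the limit only throttles), while memory and pids are not (exceeding them OOM-kills or fails fork), so compressible-only enforcement applies CPU settings and leaves the rest unset. A minimal self-contained sketch of that split, with illustrative names rather than the kubelet's actual `ResourceConfig`:

```go
package main

import "fmt"

// resourceConfig and buildConfig are illustrative, not the kubelet's
// ResourceConfig or getCgroupConfig; they only show how a
// compressibleResourcesOnly flag narrows which limits get populated.
type resourceConfig struct {
	cpuShares *uint64 // compressible: over-use is throttled
	memory    *int64  // incompressible: over-use can OOM-kill
	pidsLimit *int64  // incompressible: over-use fails fork/clone
}

func buildConfig(cpuShares uint64, memoryBytes, pids int64, compressibleOnly bool) *resourceConfig {
	rc := &resourceConfig{cpuShares: &cpuShares}
	if !compressibleOnly {
		rc.memory = &memoryBytes
		rc.pidsLimit = &pids
	}
	return rc
}

func main() {
	full := buildConfig(1024, 2<<30, 4096, false)
	cpuOnly := buildConfig(1024, 2<<30, 4096, true)
	fmt.Println(full.memory != nil)    // true: full enforcement sets memory
	fmt.Println(cpuOnly.memory == nil) // true: compressible-only leaves it unset
}
```
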
@@ -217,9 +220,37 @@ func getCgroupConfig(rl v1.ResourceList, compressibleResourcesOnly bool) *Resour
 		rc.HugePageLimit = HugePageLimits(rl)
 	}

+	// In the case of a None policy, cgroupv2 and systemd cgroup manager, we must make sure systemd is aware of the cpuset cgroup.
+	// By default, systemd will not create it, as we've not chosen to delegate it, and we haven't included it in the Apply() request.
+	// However, this causes a bug where kubelet restarts unnecessarily (cpuset cgroup is created in the cgroupfs, but systemd
+	// doesn't know about it and deletes it, and then kubelet doesn't continue because the cgroup isn't configured as expected).
+	// An alternative is to delegate the `cpuset` cgroup to the kubelet, but that would require some plumbing in libcontainer,
+	// and this is sufficient.
+	// Only do so on None policy, as Static policy will do its own updating of the cpuset.
+	if cm.NodeConfig.CPUManagerPolicy == string(cpumanager.PolicyNone) {
+		if cm.allCPUs.IsEmpty() {
+			cm.allCPUs = cm.getAllCPUs()
+		}
+		rc.CPUSet = cm.allCPUs
+	}
+
 	return &rc
 }

+func (cm *containerManagerImpl) getAllCPUs() cpuset.CPUSet {
+	machineInfo, err := cm.cadvisorInterface.MachineInfo()
+	if err != nil {
+		klog.V(4).InfoS("Failed to get machine info to get default cpuset", "error", err)
+		return cpuset.CPUSet{}
+	}
+	topo, err := topology.Discover(machineInfo)
+	if err != nil {
+		klog.V(4).InfoS("Failed to get topology info to get default cpuset", "error", err)
+		return cpuset.CPUSet{}
+	}
+	return topo.CPUDetails.CPUs()
+}
+
 // GetNodeAllocatableAbsolute returns the absolute value of Node Allocatable which is primarily useful for enforcement.
 // Note that not all resources that are available on the node are included in the returned list of resources.
 // Returns a ResourceList.
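
The new `getAllCPUs` helper returns the zero-value `cpuset.CPUSet{}` when cadvisor machine info or topology discovery fails, so the caller's `cm.allCPUs.IsEmpty()` check doubles as both a "not cached yet" test and an implicit retry after an earlier failure. A quick sketch of the `k8s.io/utils/cpuset` type being cached in `cm.allCPUs` (the 4-CPU set is an arbitrary example):

```go
package main

import (
	"fmt"

	"k8s.io/utils/cpuset"
)

func main() {
	// The zero value is empty, which is what getAllCPUs returns on a
	// discovery error.
	var all cpuset.CPUSet
	fmt.Println(all.IsEmpty()) // true

	// After a successful discovery, the set holds every online CPU ID,
	// e.g. on a 4-CPU machine:
	all = cpuset.New(0, 1, 2, 3)
	fmt.Println(all.String()) // "0-3"
	fmt.Println(all.Size())   // 4
}
```

Pinning `rc.CPUSet` to the full set changes nothing for workloads; per the comment in the diff, its purpose is to make the cpuset cgroup visible to systemd so it is not deleted out from under the kubelet.
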