@@ -31,9 +31,12 @@ import (
 	utilfeature "k8s.io/apiserver/pkg/util/feature"
 	"k8s.io/klog/v2"
 	kubefeatures "k8s.io/kubernetes/pkg/features"
+	"k8s.io/kubernetes/pkg/kubelet/cm/cpumanager"
+	"k8s.io/kubernetes/pkg/kubelet/cm/cpumanager/topology"
 	"k8s.io/kubernetes/pkg/kubelet/events"
 	"k8s.io/kubernetes/pkg/kubelet/stats/pidlimit"
 	kubetypes "k8s.io/kubernetes/pkg/kubelet/types"
+	"k8s.io/utils/cpuset"
 )
 
 const (
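
The two kubelet-side imports (`cpumanager`, `topology`) pull in the CPU manager policy name and the topology discovery used at the bottom of the patch; `k8s.io/utils/cpuset` supplies the set type itself. A standalone sketch (not part of the commit) of the two `cpuset` properties the patch relies on:

	package main

	import (
		"fmt"

		"k8s.io/utils/cpuset"
	)

	func main() {
		var unset cpuset.CPUSet      // the zero value is the empty set...
		fmt.Println(unset.IsEmpty()) // ...so IsEmpty() can stand in for "not discovered yet"

		all := cpuset.New(0, 1, 2, 3, 4, 5, 6, 7)
		fmt.Println(all.String()) // "0-7"
	}
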
@@ -52,7 +55,7 @@ func (cm *containerManagerImpl) createNodeAllocatableCgroups() error {
 	cgroupConfig := &CgroupConfig{
 		Name: cm.cgroupRoot,
 		// The default limits for cpu shares can be very low which can lead to CPU starvation for pods.
-		ResourceParameters: getCgroupConfig(nodeAllocatable),
+		ResourceParameters: cm.getCgroupConfig(nodeAllocatable),
 	}
 	if cm.cgroupManager.Exists(cgroupConfig.Name) {
 		return nil
@@ -80,7 +83,7 @@ func (cm *containerManagerImpl) enforceNodeAllocatableCgroups() error {
 
 	cgroupConfig := &CgroupConfig{
 		Name: cm.cgroupRoot,
-		ResourceParameters: getCgroupConfig(nodeAllocatable),
+		ResourceParameters: cm.getCgroupConfig(nodeAllocatable),
 	}
 
 	// Using ObjectReference for events as the node may not be cached; refer to #42701 for detail.
@@ -114,7 +117,7 @@ func (cm *containerManagerImpl) enforceNodeAllocatableCgroups() error {
 	// Now apply kube reserved and system reserved limits if required.
 	if nc.EnforceNodeAllocatable.Has(kubetypes.SystemReservedEnforcementKey) {
 		klog.V(2).InfoS("Enforcing system reserved on cgroup", "cgroupName", nc.SystemReservedCgroupName, "limits", nc.SystemReserved)
-		if err := enforceExistingCgroup(cm.cgroupManager, cm.cgroupManager.CgroupName(nc.SystemReservedCgroupName), nc.SystemReserved); err != nil {
+		if err := cm.enforceExistingCgroup(nc.SystemReservedCgroupName, nc.SystemReserved); err != nil {
 			message := fmt.Sprintf("Failed to enforce System Reserved Cgroup Limits on %q: %v", nc.SystemReservedCgroupName, err)
 			cm.recorder.Event(nodeRef, v1.EventTypeWarning, events.FailedNodeAllocatableEnforcement, message)
 			return fmt.Errorf(message)
@@ -123,7 +126,7 @@ func (cm *containerManagerImpl) enforceNodeAllocatableCgroups() error {
 	}
 	if nc.EnforceNodeAllocatable.Has(kubetypes.KubeReservedEnforcementKey) {
 		klog.V(2).InfoS("Enforcing kube reserved on cgroup", "cgroupName", nc.KubeReservedCgroupName, "limits", nc.KubeReserved)
-		if err := enforceExistingCgroup(cm.cgroupManager, cm.cgroupManager.CgroupName(nc.KubeReservedCgroupName), nc.KubeReserved); err != nil {
+		if err := cm.enforceExistingCgroup(nc.KubeReservedCgroupName, nc.KubeReserved); err != nil {
 			message := fmt.Sprintf("Failed to enforce Kube Reserved Cgroup Limits on %q: %v", nc.KubeReservedCgroupName, err)
 			cm.recorder.Event(nodeRef, v1.EventTypeWarning, events.FailedNodeAllocatableEnforcement, message)
 			return fmt.Errorf(message)
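
Both enforcement branches now reduce to the same shape: pass the configured cgroup name string to `cm.enforceExistingCgroup` and record an event on failure. A hypothetical follow-up refactor could fold them into one helper (`enforceReserved` below is illustrative, not commit code; it uses only identifiers already visible in this file):

	func (cm *containerManagerImpl) enforceReserved(kind, cgroupName string, rl v1.ResourceList, nodeRef *v1.ObjectReference) error {
		klog.V(2).InfoS("Enforcing reserved on cgroup", "kind", kind, "cgroupName", cgroupName, "limits", rl)
		if err := cm.enforceExistingCgroup(cgroupName, rl); err != nil {
			message := fmt.Sprintf("Failed to enforce %s Reserved Cgroup Limits on %q: %v", kind, cgroupName, err)
			cm.recorder.Event(nodeRef, v1.EventTypeWarning, events.FailedNodeAllocatableEnforcement, message)
			return fmt.Errorf("%s", message)
		}
		return nil
	}
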
@@ -134,8 +137,9 @@ func (cm *containerManagerImpl) enforceNodeAllocatableCgroups() error {
 }
 
 // enforceExistingCgroup updates the limits `rl` on existing cgroup `cName` using `cgroupManager` interface.
-func enforceExistingCgroup(cgroupManager CgroupManager, cName CgroupName, rl v1.ResourceList) error {
-	rp := getCgroupConfig(rl)
+func (cm *containerManagerImpl) enforceExistingCgroup(cNameStr string, rl v1.ResourceList) error {
+	cName := cm.cgroupManager.CgroupName(cNameStr)
+	rp := cm.getCgroupConfig(rl)
 	if rp == nil {
 		return fmt.Errorf("%q cgroup is not configured properly", cName)
 	}
@@ -156,17 +160,17 @@ func enforceExistingCgroup(cgroupManager CgroupManager, cName CgroupName, rl v1.
 		ResourceParameters: rp,
 	}
 	klog.V(4).InfoS("Enforcing limits on cgroup", "cgroupName", cName, "cpuShares", cgroupConfig.ResourceParameters.CPUShares, "memory", cgroupConfig.ResourceParameters.Memory, "pidsLimit", cgroupConfig.ResourceParameters.PidsLimit)
-	if err := cgroupManager.Validate(cgroupConfig.Name); err != nil {
+	if err := cm.cgroupManager.Validate(cgroupConfig.Name); err != nil {
 		return err
 	}
-	if err := cgroupManager.Update(cgroupConfig); err != nil {
+	if err := cm.cgroupManager.Update(cgroupConfig); err != nil {
 		return err
 	}
 	return nil
 }
 
 // getCgroupConfig returns a ResourceConfig object that can be used to create or update cgroups via CgroupManager interface.
-func getCgroupConfig(rl v1.ResourceList) *ResourceConfig {
+func (cm *containerManagerImpl) getCgroupConfig(rl v1.ResourceList) *ResourceConfig {
 	// TODO(vishh): Set CPU Quota if necessary.
 	if rl == nil {
 		return nil
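
The receiver on `getCgroupConfig` is what enables the next hunk: the function now reads `cm.NodeConfig.CPUManagerPolicy` and memoizes discovery results in `cm.allCPUs`, neither of which a free function could reach. A minimal, self-contained analog of that free-function-to-method move (toy types, not kubelet code):

	package main

	import "fmt"

	type manager struct {
		policy  string
		allCPUs string // lazily cached, like cm.allCPUs in the patch
	}

	// config was once a pure helper; as a method it can consult configuration
	// and cache an expensive discovery result on first use.
	func (m *manager) config() string {
		if m.policy == "none" && m.allCPUs == "" {
			m.allCPUs = "0-7" // stand-in for cadvisor/topology discovery
		}
		return fmt.Sprintf("policy=%s cpuset=%q", m.policy, m.allCPUs)
	}

	func main() {
		m := &manager{policy: "none"}
		fmt.Println(m.config()) // policy=none cpuset="0-7"
	}
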
@@ -188,9 +192,37 @@ func getCgroupConfig(rl v1.ResourceList) *ResourceConfig {
 	}
 	rc.HugePageLimit = HugePageLimits(rl)
 
+	// In the case of the None policy with cgroup v2 and the systemd cgroup manager, we must make sure systemd is aware of the cpuset cgroup.
+	// By default, systemd will not create it, as we have not chosen to delegate it, and we have not included it in the Apply() request.
+	// However, this causes a bug where the kubelet restarts unnecessarily (the cpuset cgroup is created in the cgroupfs, but systemd
+	// doesn't know about it and deletes it, and then the kubelet doesn't continue because the cgroup isn't configured as expected).
+	// An alternative is to delegate the `cpuset` cgroup to the kubelet, but that would require some plumbing in libcontainer,
+	// and this is sufficient.
+	// Only do so for the None policy, as the Static policy does its own updating of the cpuset.
+	if cm.NodeConfig.CPUManagerPolicy == string(cpumanager.PolicyNone) {
+		if cm.allCPUs.IsEmpty() {
+			cm.allCPUs = cm.getAllCPUs()
+		}
+		rc.CPUSet = cm.allCPUs
+	}
+
 	return &rc
 }
 
+func (cm *containerManagerImpl) getAllCPUs() cpuset.CPUSet {
+	machineInfo, err := cm.cadvisorInterface.MachineInfo()
+	if err != nil {
+		klog.V(4).InfoS("Failed to get machine info to get default cpuset", "error", err)
+		return cpuset.CPUSet{}
+	}
+	topo, err := topology.Discover(machineInfo)
+	if err != nil {
+		klog.V(4).InfoS("Failed to get topology info to get default cpuset", "error", err)
+		return cpuset.CPUSet{}
+	}
+	return topo.CPUDetails.CPUs()
+}
+
 // GetNodeAllocatableAbsolute returns the absolute value of Node Allocatable which is primarily useful for enforcement.
 // Note that not all resources that are available on the node are included in the returned list of resources.
 // Returns a ResourceList.