@@ -20,6 +20,9 @@ limitations under the License.
2020package kuberuntime
2121
2222import (
23+ "fmt"
24+ cadvisorv1 "github.com/google/cadvisor/info/v1"
25+ kubeapiqos "k8s.io/kubernetes/pkg/apis/core/v1/helper/qos"
2326 "math"
2427 "os"
2528 "strconv"
@@ -46,7 +49,7 @@ func (m *kubeGenericRuntimeManager) applyPlatformSpecificContainerConfig(config
4649 enforceMemoryQoS := false
4750 // Set memory.min and memory.high if MemoryQoS enabled with cgroups v2
4851 if utilfeature .DefaultFeatureGate .Enabled (kubefeatures .MemoryQoS ) &&
49- libcontainercgroups . IsCgroup2UnifiedMode () {
52+ isCgroup2UnifiedMode () {
5053 enforceMemoryQoS = true
5154 }
5255 cl , err := m .generateLinuxContainerConfig (container , pod , uid , username , nsTarget , enforceMemoryQoS )
@@ -99,21 +102,17 @@ func (m *kubeGenericRuntimeManager) generateLinuxContainerResources(pod *v1.Pod,
99102
100103 lcr .HugepageLimits = GetHugepageLimitsFromResources (container .Resources )
101104
102- if utilfeature .DefaultFeatureGate .Enabled (kubefeatures .NodeSwap ) {
105+ if swapConfigurationHelper := newSwapConfigurationHelper ( * m . machineInfo ); utilfeature .DefaultFeatureGate .Enabled (kubefeatures .NodeSwap ) {
103106 // NOTE(ehashman): Behaviour is defined in the opencontainers runtime spec:
104107 // https://github.com/opencontainers/runtime-spec/blob/1c3f411f041711bbeecf35ff7e93461ea6789220/config-linux.md#memory
105108 switch m .memorySwapBehavior {
106- case kubelettypes .UnlimitedSwap :
107- // -1 = unlimited swap
108- lcr .MemorySwapLimitInBytes = - 1
109109 case kubelettypes .LimitedSwap :
110- fallthrough
110+ swapConfigurationHelper . ConfigureLimitedSwap ( lcr , pod , container )
111111 default :
112- // memorySwapLimit = total permitted memory+swap; if equal to memory limit, => 0 swap above memory limit
113- // Some swapping is still possible.
114- // Note that if memory limit is 0, memory swap limit is ignored.
115- lcr .MemorySwapLimitInBytes = lcr .MemoryLimitInBytes
112+ swapConfigurationHelper .ConfigureUnlimitedSwap (lcr )
116113 }
114+ } else {
115+ swapConfigurationHelper .ConfigureNoSwap (lcr )
117116 }
118117
119118 // Set memory.min and memory.high to enforce MemoryQoS
@@ -122,7 +121,7 @@ func (m *kubeGenericRuntimeManager) generateLinuxContainerResources(pod *v1.Pod,
122121 memoryRequest := container .Resources .Requests .Memory ().Value ()
123122 memoryLimit := container .Resources .Limits .Memory ().Value ()
124123 if memoryRequest != 0 {
125- unified [cm .MemoryMin ] = strconv .FormatInt (memoryRequest , 10 )
124+ unified [cm .Cgroup2MemoryMin ] = strconv .FormatInt (memoryRequest , 10 )
126125 }
127126
128127 // Guaranteed pods by their QoS definition requires that memory request equals memory limit and cpu request must equal cpu limit.
@@ -148,7 +147,7 @@ func (m *kubeGenericRuntimeManager) generateLinuxContainerResources(pod *v1.Pod,
148147 }
149148 }
150149 if memoryHigh != 0 && memoryHigh > memoryRequest {
151- unified [cm .MemoryHigh ] = strconv .FormatInt (memoryHigh , 10 )
150+ unified [cm .Cgroup2MemoryHigh ] = strconv .FormatInt (memoryHigh , 10 )
152151 }
153152 }
154153 if len (unified ) > 0 {
@@ -171,7 +170,7 @@ func (m *kubeGenericRuntimeManager) generateContainerResources(pod *v1.Pod, cont
171170 enforceMemoryQoS := false
172171 // Set memory.min and memory.high if MemoryQoS enabled with cgroups v2
173172 if utilfeature .DefaultFeatureGate .Enabled (kubefeatures .MemoryQoS ) &&
174- libcontainercgroups . IsCgroup2UnifiedMode () {
173+ isCgroup2UnifiedMode () {
175174 enforceMemoryQoS = true
176175 }
177176 return & runtimeapi.ContainerResources {
@@ -216,7 +215,7 @@ func (m *kubeGenericRuntimeManager) calculateLinuxResources(cpuRequest, cpuLimit
216215 }
217216
218217 // runc requires cgroupv2 for unified mode
219- if libcontainercgroups . IsCgroup2UnifiedMode () {
218+ if isCgroup2UnifiedMode () {
220219 resources .Unified = map [string ]string {
221220 // Ask the kernel to kill all processes in the container cgroup in case of OOM.
222221 // See memory.oom.group in https://www.kernel.org/doc/html/latest/admin-guide/cgroup-v2.html for
@@ -298,3 +297,94 @@ func toKubeContainerResources(statusResources *runtimeapi.ContainerResources) *k
298297 }
299298 return cStatusResources
300299}
300+
301+ // Note: this function variable is being added here so it would be possible to mock
302+ // the cgroup version for unit tests by assigning a new mocked function into it. Without it,
303+ // the cgroup version would solely depend on the environment running the test.
304+ var isCgroup2UnifiedMode = func () bool {
305+ return libcontainercgroups .IsCgroup2UnifiedMode ()
306+ }
307+
308+ type swapConfigurationHelper struct {
309+ machineInfo cadvisorv1.MachineInfo
310+ }
311+
312+ func newSwapConfigurationHelper (machineInfo cadvisorv1.MachineInfo ) * swapConfigurationHelper {
313+ return & swapConfigurationHelper {machineInfo : machineInfo }
314+ }
315+
316+ func (m swapConfigurationHelper ) ConfigureLimitedSwap (lcr * runtimeapi.LinuxContainerResources , pod * v1.Pod , container * v1.Container ) {
317+ podQos := kubeapiqos .GetPodQOS (pod )
318+ containerDoesNotRequestMemory := container .Resources .Requests .Memory ().IsZero () && container .Resources .Limits .Memory ().IsZero ()
319+ memoryRequestEqualsToLimit := container .Resources .Requests .Memory ().Cmp (* container .Resources .Limits .Memory ()) == 0
320+
321+ if podQos != v1 .PodQOSBurstable || containerDoesNotRequestMemory || ! isCgroup2UnifiedMode () || memoryRequestEqualsToLimit {
322+ m .ConfigureNoSwap (lcr )
323+ return
324+ }
325+
326+ containerMemoryRequest := container .Resources .Requests .Memory ()
327+ swapLimit , err := calcSwapForBurstablePods (containerMemoryRequest .Value (), int64 (m .machineInfo .MemoryCapacity ), int64 (m .machineInfo .SwapCapacity ))
328+
329+ if err != nil {
330+ klog .ErrorS (err , "cannot calculate swap allocation amount; disallowing swap" )
331+ m .ConfigureNoSwap (lcr )
332+ return
333+ }
334+
335+ m .configureSwap (lcr , swapLimit )
336+ }
337+
338+ func (m swapConfigurationHelper ) ConfigureNoSwap (lcr * runtimeapi.LinuxContainerResources ) {
339+ if ! isCgroup2UnifiedMode () {
340+ // memorySwapLimit = total permitted memory+swap; if equal to memory limit, => 0 swap above memory limit
341+ // Some swapping is still possible.
342+ // Note that if memory limit is 0, memory swap limit is ignored.
343+ lcr .MemorySwapLimitInBytes = lcr .MemoryLimitInBytes
344+ return
345+ }
346+
347+ m .configureSwap (lcr , 0 )
348+ }
349+
350+ func (m swapConfigurationHelper ) ConfigureUnlimitedSwap (lcr * runtimeapi.LinuxContainerResources ) {
351+ if ! isCgroup2UnifiedMode () {
352+ m .ConfigureNoSwap (lcr )
353+ return
354+ }
355+
356+ if lcr .Unified == nil {
357+ lcr .Unified = map [string ]string {}
358+ }
359+
360+ lcr .Unified [cm .Cgroup2MaxSwapFilename ] = "max"
361+ }
362+
363+ func (m swapConfigurationHelper ) configureSwap (lcr * runtimeapi.LinuxContainerResources , swapMemory int64 ) {
364+ if ! isCgroup2UnifiedMode () {
365+ klog .ErrorS (fmt .Errorf ("swap configuration is not supported with cgroup v1" ), "swap configuration under cgroup v1 is unexpected" )
366+ return
367+ }
368+
369+ if lcr .Unified == nil {
370+ lcr .Unified = map [string ]string {}
371+ }
372+
373+ lcr .Unified [cm .Cgroup2MaxSwapFilename ] = fmt .Sprintf ("%d" , swapMemory )
374+ }
375+
// calcSwapForBurstablePods returns the amount of swap a container may use.
//
// The swap limit is calculated as
// (<containerMemoryRequest>/<nodeTotalMemory>)*<totalPodsSwapAvailable>,
// truncated towards zero. For more info, please look at the following KEP:
// https://kep.k8s.io/2400
//
// All three arguments are expected to be expressed in the same unit. An error
// is returned, together with a zero limit, when:
//   - nodeTotalMemory is not positive,
//   - containerMemoryRequest is negative or larger than nodeTotalMemory, or
//   - totalPodsSwapAvailable is negative.
//
// On success the result is always within [0, totalPodsSwapAvailable].
func calcSwapForBurstablePods(containerMemoryRequest, nodeTotalMemory, totalPodsSwapAvailable int64) (int64, error) {
	if nodeTotalMemory <= 0 {
		return 0, fmt.Errorf("total node memory %d is not positive", nodeTotalMemory)
	}
	if containerMemoryRequest < 0 {
		return 0, fmt.Errorf("container request %d is negative", containerMemoryRequest)
	}
	if containerMemoryRequest > nodeTotalMemory {
		return 0, fmt.Errorf("container request %d is larger than total node memory %d", containerMemoryRequest, nodeTotalMemory)
	}
	if totalPodsSwapAvailable < 0 {
		// A negative capacity would otherwise turn into a negative swap limit.
		return 0, fmt.Errorf("swap available to pods %d is negative", totalPodsSwapAvailable)
	}

	containerMemoryProportion := float64(containerMemoryRequest) / float64(nodeTotalMemory)
	swapAllocation := int64(containerMemoryProportion * float64(totalPodsSwapAvailable))

	// float64 cannot represent every int64 above 2^53: converting a very large
	// swap capacity may round it up, and the conversion back to int64 may even
	// overflow. The proportion never exceeds 1, so an out-of-range result can
	// only mean "all of the available swap".
	if swapAllocation < 0 || swapAllocation > totalPodsSwapAvailable {
		swapAllocation = totalPodsSwapAvailable
	}

	return swapAllocation, nil
}
0 commit comments