Skip to content

Commit 95a6013

Browse files
Refactor Resource Monitoring (#6554)
1 parent 9bc6b36 commit 95a6013

File tree

8 files changed

+151
-230
lines changed

8 files changed

+151
-230
lines changed
Lines changed: 89 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,89 @@
1+
<?xml version="1.0" encoding="utf-8"?>
2+
<!-- https://learn.microsoft.com/dotnet/fundamentals/package-validation/diagnostic-ids -->
3+
<Suppressions xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xmlns:xsd="http://www.w3.org/2001/XMLSchema">
4+
<Suppression>
5+
<DiagnosticId>CP0002</DiagnosticId>
6+
<Target>M:Microsoft.Extensions.Diagnostics.ResourceMonitoring.ResourceMonitoringOptions.get_UseDeltaNrPeriodsForCpuCalculation</Target>
7+
<Left>lib/net462/Microsoft.Extensions.Diagnostics.ResourceMonitoring.dll</Left>
8+
<Right>lib/net462/Microsoft.Extensions.Diagnostics.ResourceMonitoring.dll</Right>
9+
<IsBaselineSuppression>true</IsBaselineSuppression>
10+
</Suppression>
11+
<Suppression>
12+
<DiagnosticId>CP0002</DiagnosticId>
13+
<Target>M:Microsoft.Extensions.Diagnostics.ResourceMonitoring.ResourceMonitoringOptions.set_UseDeltaNrPeriodsForCpuCalculation(System.Boolean)</Target>
14+
<Left>lib/net462/Microsoft.Extensions.Diagnostics.ResourceMonitoring.dll</Left>
15+
<Right>lib/net462/Microsoft.Extensions.Diagnostics.ResourceMonitoring.dll</Right>
16+
<IsBaselineSuppression>true</IsBaselineSuppression>
17+
</Suppression>
18+
<Suppression>
19+
<DiagnosticId>CP0002</DiagnosticId>
20+
<Target>M:Microsoft.Extensions.Diagnostics.ResourceMonitoring.ResourceMonitoringOptions.get_UseDeltaNrPeriodsForCpuCalculation</Target>
21+
<Left>lib/net8.0/Microsoft.Extensions.Diagnostics.ResourceMonitoring.dll</Left>
22+
<Right>lib/net8.0/Microsoft.Extensions.Diagnostics.ResourceMonitoring.dll</Right>
23+
<IsBaselineSuppression>true</IsBaselineSuppression>
24+
</Suppression>
25+
<Suppression>
26+
<DiagnosticId>CP0002</DiagnosticId>
27+
<Target>M:Microsoft.Extensions.Diagnostics.ResourceMonitoring.ResourceMonitoringOptions.set_UseDeltaNrPeriodsForCpuCalculation(System.Boolean)</Target>
28+
<Left>lib/net8.0/Microsoft.Extensions.Diagnostics.ResourceMonitoring.dll</Left>
29+
<Right>lib/net8.0/Microsoft.Extensions.Diagnostics.ResourceMonitoring.dll</Right>
30+
<IsBaselineSuppression>true</IsBaselineSuppression>
31+
</Suppression>
32+
<Suppression>
33+
<DiagnosticId>CP0002</DiagnosticId>
34+
<Target>M:Microsoft.Extensions.Diagnostics.ResourceMonitoring.ResourceMonitoringOptions.get_UseDeltaNrPeriodsForCpuCalculation</Target>
35+
<Left>lib/net9.0/Microsoft.Extensions.Diagnostics.ResourceMonitoring.dll</Left>
36+
<Right>lib/net9.0/Microsoft.Extensions.Diagnostics.ResourceMonitoring.dll</Right>
37+
<IsBaselineSuppression>true</IsBaselineSuppression>
38+
</Suppression>
39+
<Suppression>
40+
<DiagnosticId>CP0002</DiagnosticId>
41+
<Target>M:Microsoft.Extensions.Diagnostics.ResourceMonitoring.ResourceMonitoringOptions.set_UseDeltaNrPeriodsForCpuCalculation(System.Boolean)</Target>
42+
<Left>lib/net9.0/Microsoft.Extensions.Diagnostics.ResourceMonitoring.dll</Left>
43+
<Right>lib/net9.0/Microsoft.Extensions.Diagnostics.ResourceMonitoring.dll</Right>
44+
<IsBaselineSuppression>true</IsBaselineSuppression>
45+
</Suppression>
46+
47+
<Suppression>
48+
<DiagnosticId>CP0002</DiagnosticId>
49+
<Target>M:Microsoft.Extensions.Diagnostics.ResourceMonitoring.ResourceMonitoringOptions.get_CalculateCpuUsageWithoutHostDelta</Target>
50+
<Left>lib/net462/Microsoft.Extensions.Diagnostics.ResourceMonitoring.dll</Left>
51+
<Right>lib/net462/Microsoft.Extensions.Diagnostics.ResourceMonitoring.dll</Right>
52+
<IsBaselineSuppression>true</IsBaselineSuppression>
53+
</Suppression>
54+
<Suppression>
55+
<DiagnosticId>CP0002</DiagnosticId>
56+
<Target>M:Microsoft.Extensions.Diagnostics.ResourceMonitoring.ResourceMonitoringOptions.set_CalculateCpuUsageWithoutHostDelta(System.Boolean)</Target>
57+
<Left>lib/net462/Microsoft.Extensions.Diagnostics.ResourceMonitoring.dll</Left>
58+
<Right>lib/net462/Microsoft.Extensions.Diagnostics.ResourceMonitoring.dll</Right>
59+
<IsBaselineSuppression>true</IsBaselineSuppression>
60+
</Suppression>
61+
<Suppression>
62+
<DiagnosticId>CP0002</DiagnosticId>
63+
<Target>M:Microsoft.Extensions.Diagnostics.ResourceMonitoring.ResourceMonitoringOptions.get_CalculateCpuUsageWithoutHostDelta</Target>
64+
<Left>lib/net8.0/Microsoft.Extensions.Diagnostics.ResourceMonitoring.dll</Left>
65+
<Right>lib/net8.0/Microsoft.Extensions.Diagnostics.ResourceMonitoring.dll</Right>
66+
<IsBaselineSuppression>true</IsBaselineSuppression>
67+
</Suppression>
68+
<Suppression>
69+
<DiagnosticId>CP0002</DiagnosticId>
70+
<Target>M:Microsoft.Extensions.Diagnostics.ResourceMonitoring.ResourceMonitoringOptions.set_CalculateCpuUsageWithoutHostDelta(System.Boolean)</Target>
71+
<Left>lib/net8.0/Microsoft.Extensions.Diagnostics.ResourceMonitoring.dll</Left>
72+
<Right>lib/net8.0/Microsoft.Extensions.Diagnostics.ResourceMonitoring.dll</Right>
73+
<IsBaselineSuppression>true</IsBaselineSuppression>
74+
</Suppression>
75+
<Suppression>
76+
<DiagnosticId>CP0002</DiagnosticId>
77+
<Target>M:Microsoft.Extensions.Diagnostics.ResourceMonitoring.ResourceMonitoringOptions.get_CalculateCpuUsageWithoutHostDelta</Target>
78+
<Left>lib/net9.0/Microsoft.Extensions.Diagnostics.ResourceMonitoring.dll</Left>
79+
<Right>lib/net9.0/Microsoft.Extensions.Diagnostics.ResourceMonitoring.dll</Right>
80+
<IsBaselineSuppression>true</IsBaselineSuppression>
81+
</Suppression>
82+
<Suppression>
83+
<DiagnosticId>CP0002</DiagnosticId>
84+
<Target>M:Microsoft.Extensions.Diagnostics.ResourceMonitoring.ResourceMonitoringOptions.set_CalculateCpuUsageWithoutHostDelta(System.Boolean)</Target>
85+
<Left>lib/net9.0/Microsoft.Extensions.Diagnostics.ResourceMonitoring.dll</Left>
86+
<Right>lib/net9.0/Microsoft.Extensions.Diagnostics.ResourceMonitoring.dll</Right>
87+
<IsBaselineSuppression>true</IsBaselineSuppression>
88+
</Suppression>
89+
</Suppressions>

src/Libraries/Microsoft.Extensions.Diagnostics.ResourceMonitoring/Linux/LinuxUtilizationProvider.cs

Lines changed: 49 additions & 107 deletions
Original file line numberDiff line numberDiff line change
@@ -17,23 +17,16 @@ internal sealed class LinuxUtilizationProvider : ISnapshotProvider
1717
{
1818
private const double One = 1.0;
1919
private const long Hundred = 100L;
20-
private const double CpuLimitThreshold110Percent = 1.1;
2120

22-
// Meters to track CPU utilization threshold exceedances
23-
private readonly Counter<long>? _cpuUtilizationLimit100PercentExceededCounter;
24-
private readonly Counter<long>? _cpuUtilizationLimit110PercentExceededCounter;
25-
26-
private readonly bool _useDeltaNrPeriods;
2721
private readonly object _cpuLocker = new();
2822
private readonly object _memoryLocker = new();
2923
private readonly ILogger<LinuxUtilizationProvider> _logger;
3024
private readonly ILinuxUtilizationParser _parser;
3125
private readonly ulong _memoryLimit;
26+
private readonly long _cpuPeriodsInterval;
3227
private readonly TimeSpan _cpuRefreshInterval;
3328
private readonly TimeSpan _memoryRefreshInterval;
3429
private readonly TimeProvider _timeProvider;
35-
private readonly double _scaleRelativeToCpuLimit;
36-
private readonly double _scaleRelativeToCpuRequest;
3730
private readonly double _scaleRelativeToCpuRequestForTrackerApi;
3831

3932
private readonly TimeSpan _retryInterval = TimeSpan.FromMinutes(5);
@@ -42,18 +35,11 @@ internal sealed class LinuxUtilizationProvider : ISnapshotProvider
4235

4336
private DateTimeOffset _refreshAfterCpu;
4437
private DateTimeOffset _refreshAfterMemory;
45-
46-
// Track the actual timestamp when we read CPU values
47-
private DateTimeOffset _lastCpuMeasurementTime;
48-
4938
private double _cpuPercentage = double.NaN;
5039
private double _lastCpuCoresUsed = double.NaN;
5140
private double _memoryPercentage;
5241
private long _previousCgroupCpuTime;
5342
private long _previousHostCpuTime;
54-
private long _cpuUtilizationLimit100PercentExceeded;
55-
private long _cpuUtilizationLimit110PercentExceeded;
56-
private long _cpuPeriodsInterval;
5743
private long _previousCgroupCpuPeriodCounter;
5844
public SystemResources Resources { get; }
5945

@@ -66,7 +52,6 @@ public LinuxUtilizationProvider(IOptions<ResourceMonitoringOptions> options, ILi
6652
DateTimeOffset now = _timeProvider.GetUtcNow();
6753
_cpuRefreshInterval = options.Value.CpuConsumptionRefreshInterval;
6854
_memoryRefreshInterval = options.Value.MemoryConsumptionRefreshInterval;
69-
_useDeltaNrPeriods = options.Value.UseDeltaNrPeriodsForCpuCalculation;
7055
_refreshAfterCpu = now;
7156
_refreshAfterMemory = now;
7257
_memoryLimit = _parser.GetAvailableMemoryInBytes();
@@ -76,8 +61,8 @@ public LinuxUtilizationProvider(IOptions<ResourceMonitoringOptions> options, ILi
7661
float hostCpus = _parser.GetHostCpuCount();
7762
float cpuLimit = _parser.GetCgroupLimitedCpus();
7863
float cpuRequest = _parser.GetCgroupRequestCpu();
79-
_scaleRelativeToCpuLimit = hostCpus / cpuLimit;
80-
_scaleRelativeToCpuRequest = hostCpus / cpuRequest;
64+
float scaleRelativeToCpuLimit = hostCpus / cpuLimit;
65+
float scaleRelativeToCpuRequest = hostCpus / cpuRequest;
8166
_scaleRelativeToCpuRequestForTrackerApi = hostCpus; // the division by cpuRequest is performed later on in the ResourceUtilization class
8267

8368
#pragma warning disable CA2000 // Dispose objects before losing scope
@@ -87,46 +72,40 @@ public LinuxUtilizationProvider(IOptions<ResourceMonitoringOptions> options, ILi
8772
var meter = meterFactory.Create(ResourceUtilizationInstruments.MeterName);
8873
#pragma warning restore CA2000 // Dispose objects before losing scope
8974

90-
if (options.Value.CalculateCpuUsageWithoutHostDelta)
75+
if (options.Value.UseLinuxCalculationV2)
9176
{
9277
cpuLimit = _parser.GetCgroupLimitV2();
93-
94-
// Try to get the CPU request from cgroup
9578
cpuRequest = _parser.GetCgroupRequestCpuV2();
9679

9780
// Get the CPU period interval from the cgroup
9881
_cpuPeriodsInterval = _parser.GetCgroupPeriodsIntervalInMicroSecondsV2();
9982
(_previousCgroupCpuTime, _previousCgroupCpuPeriodCounter) = _parser.GetCgroupCpuUsageInNanosecondsAndCpuPeriodsV2();
10083

101-
// Initialize the counters
102-
_cpuUtilizationLimit100PercentExceededCounter = meter.CreateCounter<long>("cpu_utilization_limit_100_percent_exceeded");
103-
_cpuUtilizationLimit110PercentExceededCounter = meter.CreateCounter<long>("cpu_utilization_limit_110_percent_exceeded");
104-
10584
_ = meter.CreateObservableGauge(
10685
ResourceUtilizationInstruments.ContainerCpuLimitUtilization,
10786
() => GetMeasurementWithRetry(() => CpuUtilizationLimit(cpuLimit)),
10887
"1");
10988

11089
_ = meter.CreateObservableGauge(
11190
name: ResourceUtilizationInstruments.ContainerCpuRequestUtilization,
112-
observeValues: () => GetMeasurementWithRetry(() => CpuUtilizationWithoutHostDelta() / cpuRequest),
91+
observeValues: () => GetMeasurementWithRetry(() => CpuUtilizationRequest(cpuRequest)),
11392
unit: "1");
11493
}
11594
else
11695
{
11796
_ = meter.CreateObservableGauge(
11897
name: ResourceUtilizationInstruments.ContainerCpuLimitUtilization,
119-
observeValues: () => GetMeasurementWithRetry(() => CpuUtilization() * _scaleRelativeToCpuLimit),
98+
observeValues: () => GetMeasurementWithRetry(() => CpuUtilization() * scaleRelativeToCpuLimit),
12099
unit: "1");
121100

122101
_ = meter.CreateObservableGauge(
123102
name: ResourceUtilizationInstruments.ContainerCpuRequestUtilization,
124-
observeValues: () => GetMeasurementWithRetry(() => CpuUtilization() * _scaleRelativeToCpuRequest),
103+
observeValues: () => GetMeasurementWithRetry(() => CpuUtilization() * scaleRelativeToCpuRequest),
125104
unit: "1");
126105

127106
_ = meter.CreateObservableGauge(
128107
name: ResourceUtilizationInstruments.ProcessCpuUtilization,
129-
observeValues: () => GetMeasurementWithRetry(() => CpuUtilization() * _scaleRelativeToCpuRequest),
108+
observeValues: () => GetMeasurementWithRetry(() => CpuUtilization() * scaleRelativeToCpuRequest),
130109
unit: "1");
131110
}
132111

@@ -148,10 +127,9 @@ public LinuxUtilizationProvider(IOptions<ResourceMonitoringOptions> options, ILi
148127
_logger.SystemResourcesInfo(cpuLimit, cpuRequest, _memoryLimit, _memoryLimit);
149128
}
150129

151-
public double CpuUtilizationWithoutHostDelta()
130+
public double CpuUtilizationV2()
152131
{
153132
DateTimeOffset now = _timeProvider.GetUtcNow();
154-
double actualElapsedNanoseconds = (now - _lastCpuMeasurementTime).TotalNanoseconds;
155133
lock (_cpuLocker)
156134
{
157135
if (now < _refreshAfterCpu)
@@ -160,79 +138,34 @@ public double CpuUtilizationWithoutHostDelta()
160138
}
161139
}
162140

163-
var (cpuUsageTime, cpuPeriodCounter) = _parser.GetCgroupCpuUsageInNanosecondsAndCpuPeriodsV2();
141+
(long cpuUsageTime, long cpuPeriodCounter) = _parser.GetCgroupCpuUsageInNanosecondsAndCpuPeriodsV2();
164142
lock (_cpuLocker)
165143
{
166-
if (now >= _refreshAfterCpu)
144+
if (now < _refreshAfterCpu)
167145
{
168-
long deltaCgroup = cpuUsageTime - _previousCgroupCpuTime;
169-
double coresUsed;
170-
171-
if (_useDeltaNrPeriods)
172-
{
173-
long deltaPeriodCount = cpuPeriodCounter - _previousCgroupCpuPeriodCounter;
174-
long deltaCpuPeriodInNanoseconds = deltaPeriodCount * _cpuPeriodsInterval * 1000;
175-
176-
if (deltaCgroup > 0 && deltaPeriodCount > 0)
177-
{
178-
coresUsed = deltaCgroup / (double)deltaCpuPeriodInNanoseconds;
179-
180-
_logger.CpuUsageDataV2(cpuUsageTime, _previousCgroupCpuTime, deltaCpuPeriodInNanoseconds, coresUsed);
181-
182-
_lastCpuCoresUsed = coresUsed;
183-
_refreshAfterCpu = now.Add(_cpuRefreshInterval);
184-
_previousCgroupCpuTime = cpuUsageTime;
185-
_previousCgroupCpuPeriodCounter = cpuPeriodCounter;
186-
}
187-
}
188-
else
189-
{
190-
if (deltaCgroup > 0)
191-
{
192-
coresUsed = deltaCgroup / actualElapsedNanoseconds;
193-
194-
_logger.CpuUsageDataV2(cpuUsageTime, _previousCgroupCpuTime, actualElapsedNanoseconds, coresUsed);
195-
196-
_lastCpuCoresUsed = coresUsed;
197-
_refreshAfterCpu = now.Add(_cpuRefreshInterval);
198-
_previousCgroupCpuTime = cpuUsageTime;
199-
200-
// Update the timestamp for next calculation
201-
_lastCpuMeasurementTime = now;
202-
}
203-
}
146+
return _lastCpuCoresUsed;
204147
}
205-
}
206148

207-
return _lastCpuCoresUsed;
208-
}
149+
long deltaCgroup = cpuUsageTime - _previousCgroupCpuTime;
150+
long deltaPeriodCount = cpuPeriodCounter - _previousCgroupCpuPeriodCounter;
209151

210-
/// <summary>
211-
/// Calculates CPU utilization relative to the CPU limit.
212-
/// </summary>
213-
/// <param name="cpuLimit">The CPU limit to use for the calculation.</param>
214-
/// <returns>CPU usage as a ratio of the limit.</returns>
215-
public double CpuUtilizationLimit(float cpuLimit)
216-
{
217-
double utilization = CpuUtilizationWithoutHostDelta() / cpuLimit;
152+
if (deltaCgroup <= 0 || deltaPeriodCount <= 0)
153+
{
154+
return _lastCpuCoresUsed;
155+
}
218156

219-
// Increment counter if utilization exceeds 1 (100%)
220-
if (utilization > 1.0)
221-
{
222-
_cpuUtilizationLimit100PercentExceededCounter?.Add(1);
223-
_cpuUtilizationLimit100PercentExceeded++;
224-
_logger.CounterMessage100(_cpuUtilizationLimit100PercentExceeded);
225-
}
157+
long deltaCpuPeriodInNanoseconds = deltaPeriodCount * _cpuPeriodsInterval * 1000;
158+
double coresUsed = deltaCgroup / (double)deltaCpuPeriodInNanoseconds;
226159

227-
// Increment counter if utilization exceeds 110%
228-
if (utilization > CpuLimitThreshold110Percent)
229-
{
230-
_cpuUtilizationLimit110PercentExceededCounter?.Add(1);
231-
_cpuUtilizationLimit110PercentExceeded++;
232-
_logger.CounterMessage110(_cpuUtilizationLimit110PercentExceeded);
160+
_logger.CpuUsageDataV2(cpuUsageTime, _previousCgroupCpuTime, deltaCpuPeriodInNanoseconds, coresUsed);
161+
162+
_lastCpuCoresUsed = coresUsed;
163+
_refreshAfterCpu = now.Add(_cpuRefreshInterval);
164+
_previousCgroupCpuTime = cpuUsageTime;
165+
_previousCgroupCpuPeriodCounter = cpuPeriodCounter;
233166
}
234167

235-
return utilization;
168+
return _lastCpuCoresUsed;
236169
}
237170

238171
public double CpuUtilization()
@@ -252,23 +185,27 @@ public double CpuUtilization()
252185

253186
lock (_cpuLocker)
254187
{
255-
if (now >= _refreshAfterCpu)
188+
if (now < _refreshAfterCpu)
256189
{
257-
long deltaHost = hostCpuTime - _previousHostCpuTime;
258-
long deltaCgroup = cgroupCpuTime - _previousCgroupCpuTime;
259-
260-
if (deltaHost > 0 && deltaCgroup > 0)
261-
{
262-
double percentage = Math.Min(One, (double)deltaCgroup / deltaHost);
190+
return _cpuPercentage;
191+
}
263192

264-
_logger.CpuUsageData(cgroupCpuTime, hostCpuTime, _previousCgroupCpuTime, _previousHostCpuTime, percentage);
193+
long deltaHost = hostCpuTime - _previousHostCpuTime;
194+
long deltaCgroup = cgroupCpuTime - _previousCgroupCpuTime;
265195

266-
_cpuPercentage = percentage;
267-
_refreshAfterCpu = now.Add(_cpuRefreshInterval);
268-
_previousCgroupCpuTime = cgroupCpuTime;
269-
_previousHostCpuTime = hostCpuTime;
270-
}
196+
if (deltaHost <= 0 || deltaCgroup <= 0)
197+
{
198+
return _cpuPercentage;
271199
}
200+
201+
double percentage = Math.Min(One, (double)deltaCgroup / deltaHost);
202+
203+
_logger.CpuUsageData(cgroupCpuTime, hostCpuTime, _previousCgroupCpuTime, _previousHostCpuTime, percentage);
204+
205+
_cpuPercentage = percentage;
206+
_refreshAfterCpu = now.Add(_cpuRefreshInterval);
207+
_previousCgroupCpuTime = cgroupCpuTime;
208+
_previousHostCpuTime = hostCpuTime;
272209
}
273210

274211
return _cpuPercentage;
@@ -351,4 +288,9 @@ ex is System.IO.DirectoryNotFoundException ||
351288
return Enumerable.Empty<Measurement<double>>();
352289
}
353290
}
291+
292+
// Math.Min() is used below to mitigate margin errors and various kinds of precision loss
293+
// due to the fact that the calculation itself is not an atomic operation:
294+
private double CpuUtilizationRequest(double cpuRequest) => Math.Min(One, CpuUtilizationV2() / cpuRequest);
295+
private double CpuUtilizationLimit(double cpuLimit) => Math.Min(One, CpuUtilizationV2() / cpuLimit);
354296
}

src/Libraries/Microsoft.Extensions.Diagnostics.ResourceMonitoring/Linux/Log.cs

Lines changed: 1 addition & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -50,19 +50,7 @@ public static partial void CpuUsageDataV2(
5050
double actualElapsedNanoseconds,
5151
double cpuCores);
5252

53-
[LoggerMessage(5, LogLevel.Debug,
54-
"CPU utilization exceeded 100%: Counter = {counterValue}")]
55-
public static partial void CounterMessage100(
56-
this ILogger logger,
57-
long counterValue);
58-
59-
[LoggerMessage(6, LogLevel.Debug,
60-
"CPU utilization exceeded 110%: Counter = {counterValue}")]
61-
public static partial void CounterMessage110(
62-
this ILogger logger,
63-
long counterValue);
64-
65-
[LoggerMessage(7, LogLevel.Warning,
53+
[LoggerMessage(5, LogLevel.Warning,
6654
"Error while getting disk stats: Error={errorMessage}")]
6755
public static partial void HandleDiskStatsException(
6856
this ILogger logger,

0 commit comments

Comments
 (0)