From 301667b4522cdf165094b7619b5e13fcf9c90c9a Mon Sep 17 00:00:00 2001
From: Tim Gross
Date: Mon, 28 Jul 2025 15:30:03 -0400
Subject: [PATCH] scheduler: feasibility check that memory_max fits in total

The `resource.memory_max` field is intended to allow memory oversubscription,
so we don't check it in the `AllocsFit` method where we're totalling up the
requested memory for all allocs on a node. But we never check that the value
can even fit in the total amount of memory on the node, which can result in
nonsensical placements. When iterating nodes in the feasibility check phase,
check that the `memory_max` field doesn't exceed the total amount of memory
on the node.

Note that this check deliberately ignores "reserved memory", as the feature
is intended to allow oversubscription.

Fixes: https://github.com/hashicorp/nomad/issues/26360
---
 scheduler/feasible/rank.go                     |  7 ++++
 scheduler/feasible/rank_test.go                | 40 ++++++++++++------
 .../content/docs/upgrade/upgrade-specific.mdx  |  7 +++-
 3 files changed, 39 insertions(+), 15 deletions(-)

diff --git a/scheduler/feasible/rank.go b/scheduler/feasible/rank.go
index e2c94161961..9334c169158 100644
--- a/scheduler/feasible/rank.go
+++ b/scheduler/feasible/rank.go
@@ -383,6 +383,13 @@ NEXTNODE:
 			if iter.memoryOversubscription {
 				taskResources.Memory.MemoryMaxMB = safemath.Add(
 					int64(task.Resources.MemoryMaxMB), int64(task.Resources.SecretsMB))
+
+				if taskResources.Memory.MemoryMaxMB > option.Node.NodeResources.Memory.MemoryMB {
+					iter.ctx.Metrics().FilterNode(option.Node,
+						"task memory_max exceeds maximum available memory")
+					netIdx.Release()
+					continue NEXTNODE
+				}
 			}
 
 			// Check if we need a network resource
diff --git a/scheduler/feasible/rank_test.go b/scheduler/feasible/rank_test.go
index 2657f492df7..af771e12aec 100644
--- a/scheduler/feasible/rank_test.go
+++ b/scheduler/feasible/rank_test.go
@@ -110,6 +110,18 @@ func TestBinPackIterator_NoExistingAlloc(t *testing.T) {
 				},
 			},
 		},
+		{
+			Node: &structs.Node{
+				// Empty but memory_max won't fit
+				NodeResources: &structs.NodeResources{
+					Processors: processorResources4096,
+					Cpu:        legacyCpuResources4096,
+					Memory: structs.NodeMemoryResources{
+						MemoryMB: 1024,
+					},
+				},
+			},
+		},
 	}
 
 	static := NewStaticRankIterator(ctx, nodes)
@@ -119,8 +131,9 @@ func TestBinPackIterator_NoExistingAlloc(t *testing.T) {
 			{
 				Name: "web",
 				Resources: &structs.Resources{
-					CPU:      1024,
-					MemoryMB: 1024,
+					CPU:         1024,
+					MemoryMB:    1024,
+					MemoryMaxMB: 2048,
 				},
 			},
 		},
@@ -132,19 +145,18 @@ func TestBinPackIterator_NoExistingAlloc(t *testing.T) {
 	scoreNorm := NewScoreNormalizationIterator(ctx, binp)
 
 	out := collectRanked(scoreNorm)
-	if len(out) != 2 {
-		t.Fatalf("Bad: %v", out)
-	}
-	if out[0] != nodes[0] || out[1] != nodes[2] {
-		t.Fatalf("Bad: %v", out)
-	}
+	must.Len(t, 2, out, must.Sprint("expected 2 scored nodes"))
+
+	must.Eq(t, nodes[0], out[0], must.Sprint("expected nodes[0] first"))
+	must.Eq(t, nodes[2], out[1], must.Sprint("expected nodes[2] second"))
+	must.Eq(t, 1.0, out[0].FinalScore, must.Sprint("bad score"))
+	must.Greater(t, 0.50, out[1].FinalScore, must.Sprint("bad score"))
+	must.Less(t, 0.60, out[1].FinalScore, must.Sprint("bad score"))
+
+	// infeasible nodes nodes[1] and nodes[3] are not scored
+	must.Eq(t, 2,
+		ctx.metrics.ConstraintFiltered["task memory_max exceeds maximum available memory"])
 
-	if out[0].FinalScore != 1.0 {
-		t.Fatalf("Bad Score: %v", out[0].FinalScore)
-	}
-	if out[1].FinalScore < 0.50 || out[1].FinalScore > 0.60 {
-		t.Fatalf("Bad Score: %v", out[1].FinalScore)
-	}
 }
 
 // TestBinPackIterator_NoExistingAlloc_MixedReserve asserts that node's with
diff --git a/website/content/docs/upgrade/upgrade-specific.mdx b/website/content/docs/upgrade/upgrade-specific.mdx
index 9254888f971..fcd1dd1adb1 100644
--- a/website/content/docs/upgrade/upgrade-specific.mdx
+++ b/website/content/docs/upgrade/upgrade-specific.mdx
@@ -18,7 +18,12 @@ used to document those details separately from the standard upgrade flow.
 
 In Nomad 1.11.0, submitting a sysbatch job with a `reschedule` block returns
 an error instead of being silently ignored, as it was in previous versions. The
-same behavior applies to system jobs.
+same behavior applies to system jobs.
+
+#### Memory oversubscription checked against total memory on node
+
+In Nomad 1.11.0, the scheduler checks that the `resources.memory_max` of a
+task doesn't exceed the total memory on a given node.
 
 ## Nomad 1.10.2
 
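As a rough illustration of the rule described in the upgrade note above, the
following standalone Go sketch mirrors the comparison the scheduler now makes
during feasibility checking; the `memoryMaxFits` helper and its parameters are
hypothetical names used only for this example, not the scheduler's actual API.

package main

import "fmt"

// memoryMaxFits reports whether a task's memory_max request could ever be
// satisfied on a node. The comparison is against the node's total memory
// rather than total minus reserved memory, because memory oversubscription
// deliberately ignores the reserved-memory carve-out.
func memoryMaxFits(taskMemoryMaxMB, nodeTotalMemoryMB int64) bool {
	return taskMemoryMaxMB <= nodeTotalMemoryMB
}

func main() {
	// A task with memory_max = 2048 MB can never run on a 1024 MB node, so
	// the scheduler now filters such a node during the feasibility phase.
	fmt.Println(memoryMaxFits(2048, 1024)) // false
	fmt.Println(memoryMaxFits(2048, 4096)) // true
}

Comparing against total rather than unreserved memory keeps oversubscription
usable while still rejecting placements that could never succeed.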