Skip to content

Commit 2b959bc

Browse files
himani2411 and Himani Anil Deshpande authored
[Gb200] Support IMEX configuration to be local to a node (#3029)
* We remove the creation of the `/opt/parallelcluster/shared/nvidia-imex` directory.
* We keep the default paths `/etc/nvidia-imex/nodes_config.cfg` and `/etc/nvidia-imex/config.cfg` for the IMEX configuration.
* We create `/etc/nvidia-imex/nodes_config.cfg` only if it is missing, to avoid IMEX start failures.
* Update unit tests.

Co-authored-by: Himani Anil Deshpande <[email protected]>
1 parent e81104a commit 2b959bc

File tree

4 files changed

+83
-61
lines changed

4 files changed

+83
-61
lines changed

cookbooks/aws-parallelcluster-platform/attributes/platform.rb

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,6 @@
2424
end
2525

2626
# nvidia-imex
27-
default['cluster']['nvidia']['imex']['shared_dir'] = "#{node['cluster']['shared_dir']}/nvidia-imex"
2827
default['cluster']['nvidia']['imex']['force_configuration'] = false
2928

3029
# NVIDIA NVLSM

cookbooks/aws-parallelcluster-platform/resources/nvidia_imex/partial/_nvidia_imex_common.rb

Lines changed: 36 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -19,21 +19,51 @@
1919
return unless nvidia_enabled_or_installed?
2020
return if on_docker? || imex_installed? || aws_region.start_with?("us-iso")
2121

22-
directory node['cluster']['nvidia']['imex']['shared_dir']
23-
2422
action_install_imex
23+
24+
# Create Imex configuration files
25+
action_create_configuration_files
2526
# Save Imex version in Node Attributes for InSpec Tests
2627
node.default['cluster']['nvidia']['imex']['version'] = nvidia_imex_full_version
2728
node.default['cluster']['nvidia']['imex']['package'] = nvidia_imex_package
2829
node_attributes 'dump node attributes'
2930
end
3031

32+
action :create_configuration_files do
33+
# We create or update IMEX configuration files if ParallelCluster is installing IMEX
34+
template nvidia_imex_nodes_conf_file do
35+
source 'nvidia-imex/nvidia-imex-nodes.erb'
36+
owner 'root'
37+
group 'root'
38+
mode '0755'
39+
action :create
40+
end
41+
42+
template nvidia_imex_main_conf_file do
43+
source 'nvidia-imex/nvidia-imex-config.erb'
44+
owner 'root'
45+
group 'root'
46+
mode '0755'
47+
action :create
48+
variables(imex_nodes_config_file_path: nvidia_imex_nodes_conf_file)
49+
end
50+
51+
# We keep nvidia-imex.service file in this location to give precedence to pcluster configured service file.
52+
template "/etc/systemd/system/#{nvidia_imex_service}.service" do
53+
source 'nvidia-imex/nvidia-imex.service.erb'
54+
owner 'root'
55+
group 'root'
56+
mode '0644'
57+
action :create
58+
variables(imex_main_config_file_path: nvidia_imex_main_conf_file)
59+
end
60+
end
61+
3162
action :configure do
3263
return unless imex_installed? && node['cluster']['node_type'] == "ComputeFleet"
3364
# Start nvidia-imex on p6e-gb200 and only on ComputeFleet
3465
if is_gb200_node? || enable_force_configuration?
35-
# For each Compute Resource, we generate a unique NVIDIA IMEX configuration file,
36-
# if one doesn't already exist in a common, shared location.
66+
# Create the file if this is missing otherwise Imex service will not start
3767
template nvidia_imex_nodes_conf_file do
3868
source 'nvidia-imex/nvidia-imex-nodes.erb'
3969
owner 'root'
@@ -42,24 +72,6 @@
4272
action :create_if_missing
4373
end
4474

45-
template nvidia_imex_main_conf_file do
46-
source 'nvidia-imex/nvidia-imex-config.erb'
47-
owner 'root'
48-
group 'root'
49-
mode '0755'
50-
action :create_if_missing
51-
variables(imex_nodes_config_file_path: nvidia_imex_nodes_conf_file)
52-
end
53-
54-
template "/etc/systemd/system/#{nvidia_imex_service}.service" do
55-
source 'nvidia-imex/nvidia-imex.service.erb'
56-
owner 'root'
57-
group 'root'
58-
mode '0644'
59-
action :create
60-
variables(imex_main_config_file_path: nvidia_imex_main_conf_file)
61-
end
62-
6375
service nvidia_imex_service do
6476
action %i(enable start)
6577
supports status: true
@@ -92,11 +104,11 @@ def nvidia_enabled_or_installed?
92104
end
93105

94106
def nvidia_imex_main_conf_file
95-
"#{node['cluster']['nvidia']['imex']['shared_dir']}/config_#{node['cluster']['scheduler_queue_name']}_#{node['cluster']['scheduler_compute_resource_name']}.cfg"
107+
"/etc/nvidia-imex/config.cfg"
96108
end
97109

98110
def nvidia_imex_nodes_conf_file
99-
"#{node['cluster']['nvidia']['imex']['shared_dir']}/nodes_config_#{node['cluster']['scheduler_queue_name']}_#{node['cluster']['scheduler_compute_resource_name']}.cfg"
111+
"/etc/nvidia-imex/nodes_config.cfg"
100112
end
101113

102114
def enable_force_configuration?

cookbooks/aws-parallelcluster-platform/spec/unit/resources/nvidia_imex_spec.rb

Lines changed: 46 additions & 35 deletions
Original file line numberDiff line numberDiff line change
@@ -2,11 +2,12 @@
22

33
nvidia_version = "1.2.3"
44
SOURCE_DIR = 'SOURCE_DIR'.freeze
5-
nvidia_imex_shared_dir = "SHARED_DIR/nvidia-imex"
5+
nvidia_imex_dir = "/etc/nvidia-imex"
6+
imex_main_conf_file = "#{nvidia_imex_dir}/config.cfg"
7+
imex_nodes_conf_file = "#{nvidia_imex_dir}/nodes_config.cfg"
8+
imex_service_file = "/etc/systemd/system/nvidia-imex.service"
69
imex_binary = '/usr/bin/nvidia-imex'
710
imex_ctl_binary = '/usr/bin/nvidia-imex-ctl'
8-
queue_name = 'queue-name'
9-
compute_resource_name = 'compute-resource-name'
1011
cluster_artifacts_s3_url = 'https://aws_region-aws-parallelcluster.s3.aws_region.AWS_DOMAIN'
1112

1213
class ConvergeNvidiaImex
@@ -18,6 +19,14 @@ def self.install(chef_run)
1819
end
1920
end
2021

22+
def self.create_configuration_files(chef_run)
23+
chef_run.converge_dsl('aws-parallelcluster-platform') do
24+
nvidia_imex 'create_configuration_files' do
25+
action :create_configuration_files
26+
end
27+
end
28+
end
29+
2130
def self.configure(chef_run)
2231
chef_run.converge_dsl('aws-parallelcluster-platform') do
2332
nvidia_imex 'configure' do
@@ -231,7 +240,6 @@ def self.configure(chef_run)
231240
cached(:node) { chef_run.node }
232241

233242
before do
234-
chef_run.node.override['cluster']['nvidia']['imex']['shared_dir'] = nvidia_imex_shared_dir
235243
chef_run.node.override['cluster']['artifacts_s3_url'] = cluster_artifacts_s3_url
236244
chef_run.node.override['cluster']['region'] = 'aws_region'
237245
chef_run.node.override['cluster']['sources_dir'] = SOURCE_DIR
@@ -241,7 +249,6 @@ def self.configure(chef_run)
241249
end
242250
if platform == 'amazon' && version == '2'
243251
it 'does not install nvidia-imex' do
244-
is_expected.not_to create_directory(nvidia_imex_shared_dir)
245252
is_expected.not_to install_install_packages('Install nvidia-imex')
246253
.with(packages: "#{nvidia_imex_name}")
247254
.with(action: %i(install))
@@ -254,7 +261,6 @@ def self.configure(chef_run)
254261
else
255262

256263
it 'installs nvidia-imex' do
257-
is_expected.to create_directory(nvidia_imex_shared_dir)
258264
if platform == 'ubuntu'
259265
is_expected.to create_if_missing_remote_file("#{SOURCE_DIR}/#{nvidia_imex_package}-#{nvidia_imex_version}.deb").with(
260266
source: "#{cluster_artifacts_s3_url}/dependencies/nvidia_imex/#{url_suffix}.deb",
@@ -294,6 +300,38 @@ def self.configure(chef_run)
294300
end
295301
end
296302

303+
describe 'nvidia_imex:create_configuration_files' do
304+
for_all_oses do |platform, version|
305+
context "on #{platform}#{version}" do
306+
cached(:chef_run) do
307+
runner = runner(platform: platform, version: version, step_into: ['nvidia_imex'])
308+
ConvergeNvidiaImex.create_configuration_files(runner)
309+
end
310+
cached(:node) { chef_run.node }
311+
312+
it 'does create Imex configuration files' do
313+
is_expected.to create_template("#{imex_nodes_conf_file}")
314+
.with(source: 'nvidia-imex/nvidia-imex-nodes.erb')
315+
.with(user: 'root')
316+
.with(group: 'root')
317+
.with(mode: '0755')
318+
is_expected.to create_template("#{imex_main_conf_file}")
319+
.with(source: 'nvidia-imex/nvidia-imex-config.erb')
320+
.with(user: 'root')
321+
.with(group: 'root')
322+
.with(mode: '0755')
323+
.with(variables: { imex_nodes_config_file_path: "#{imex_nodes_conf_file}" })
324+
is_expected.to create_template(imex_service_file)
325+
.with(source: 'nvidia-imex/nvidia-imex.service.erb')
326+
.with(user: 'root')
327+
.with(group: 'root')
328+
.with(mode: '0644')
329+
.with(variables: { imex_main_config_file_path: "#{imex_main_conf_file}" })
330+
end
331+
end
332+
end
333+
end
334+
297335
describe 'nvidia_imex:configure' do
298336
[%w(false), [false], %w(no), %w(true), [true], %w(yes)].each do |force_indicator|
299337
for_all_oses do |platform, version|
@@ -329,54 +367,27 @@ def self.configure(chef_run)
329367
before do
330368
chef_run.node.override['cluster']['region'] = 'aws_region'
331369
chef_run.node.override['cluster']['nvidia']['imex']['force_configuration'] = force_indicator
332-
chef_run.node.override['cluster']['nvidia']['imex']['shared_dir'] = nvidia_imex_shared_dir
333370
chef_run.node.override['cluster']['node_type'] = node_type
334-
chef_run.node.override['cluster']['scheduler_queue_name'] = queue_name
335-
chef_run.node.override['cluster']['scheduler_compute_resource_name'] = compute_resource_name
336371

337372
ConvergeNvidiaImex.configure(chef_run)
338373
end
339374

340375
if (platform == 'amazon' && version == '2') || %w(HeadNode LoginNode).include?(node_type)
341376
it 'does not configure nvidia-imex' do
342-
is_expected.not_to create_if_missing_template("#{nvidia_imex_shared_dir}/nodes_config_#{queue_name}_#{compute_resource_name}.cfg")
377+
is_expected.not_to create_if_missing_template("#{imex_nodes_conf_file}")
343378
.with(source: 'nvidia-imex/nvidia-imex-nodes.erb')
344379
.with(user: 'root')
345380
.with(group: 'root')
346381
.with(mode: '0755')
347-
is_expected.not_to create_if_missing_template("#{nvidia_imex_shared_dir}/config_#{queue_name}_#{compute_resource_name}.cfg")
348-
.with(source: 'nvidia-imex/nvidia-imex-config.erb')
349-
.with(user: 'root')
350-
.with(group: 'root')
351-
.with(mode: '0755')
352-
.with(variables: { imex_nodes_config_file_path: "#{nvidia_imex_shared_dir}/nodes_config_#{queue_name}_#{compute_resource_name}.cfg" })
353-
is_expected.not_to create_template("/etc/systemd/system/nvidia-imex.service")
354-
.with(source: 'nvidia-imex/nvidia-imex.service.erb')
355-
.with(user: 'root')
356-
.with(group: 'root')
357-
.with(mode: '0644')
358-
.with(variables: { imex_main_config_file_path: "#{nvidia_imex_shared_dir}/config_#{queue_name}_#{compute_resource_name}.cfg" })
359382
is_expected.not_to start_service('nvidia-imex').with_action(%i(enable start)).with_supports({ status: true })
360383
end
361384
else
362385
it 'it starts nvidia-imex service' do
363-
is_expected.to create_if_missing_template("#{nvidia_imex_shared_dir}/nodes_config_#{queue_name}_#{compute_resource_name}.cfg")
386+
is_expected.to create_if_missing_template("#{imex_nodes_conf_file}")
364387
.with(source: 'nvidia-imex/nvidia-imex-nodes.erb')
365388
.with(user: 'root')
366389
.with(group: 'root')
367390
.with(mode: '0755')
368-
is_expected.to create_if_missing_template("#{nvidia_imex_shared_dir}/config_#{queue_name}_#{compute_resource_name}.cfg")
369-
.with(source: 'nvidia-imex/nvidia-imex-config.erb')
370-
.with(user: 'root')
371-
.with(group: 'root')
372-
.with(mode: '0755')
373-
.with(variables: { imex_nodes_config_file_path: "#{nvidia_imex_shared_dir}/nodes_config_#{queue_name}_#{compute_resource_name}.cfg" })
374-
is_expected.to create_template("/etc/systemd/system/nvidia-imex.service")
375-
.with(source: 'nvidia-imex/nvidia-imex.service.erb')
376-
.with(user: 'root')
377-
.with(group: 'root')
378-
.with(mode: '0644')
379-
.with(variables: { imex_main_config_file_path: "#{nvidia_imex_shared_dir}/config_#{queue_name}_#{compute_resource_name}.cfg" })
380391
is_expected.to start_service('nvidia-imex').with_action(%i(enable start)).with_supports({ status: true })
381392
end
382393
end

cookbooks/aws-parallelcluster-platform/test/controls/nvidia_imex_spec.rb

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -36,7 +36,7 @@
3636
its('owner') { should eq 'root' }
3737
its('group') { should eq 'root' }
3838
its('mode') { should cmp '0644' }
39-
its('content') { should match %r{ExecStart=/usr/bin/nvidia-imex -c #{node['cluster']['nvidia']['imex']['shared_dir']}} }
39+
its('content') { should match %r{ExecStart=/usr/bin/nvidia-imex -c /etc/nvidia-imex/config.cfg} }
4040
end
4141

4242
describe service('nvidia-imex') do

0 commit comments

Comments
 (0)