From 3fe23d35ac8669c2bf87c9de940ea531a6e5b550 Mon Sep 17 00:00:00 2001 From: Luca Carrogu Date: Wed, 10 Apr 2019 12:00:27 +0200 Subject: [PATCH 01/50] Add possibility to test kitchen the AMI with custom node package The custom node package could be passed as URL pointing to the node archive Signed-off-by: Luca Carrogu --- .kitchen.yml | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/.kitchen.yml b/.kitchen.yml index 6324e48a8d..6e71fed242 100644 --- a/.kitchen.yml +++ b/.kitchen.yml @@ -68,6 +68,7 @@ suites: cfn_ephemeral_dir: <%= ENV['CFN_EPHEMERAL_DIR'] %> cfn_shared_dir: <%= ENV['CFN_SHARED_DIR'] %> cfn_cluster_user: <%= ENV['CFN_CLUSTER_USER'] %> + custom_node_package: <%= ENV['PARALLELCLUSTER_NODE_URL'] %> - name: sge_config_MasterServer run_list: @@ -87,6 +88,7 @@ suites: cfn_cluster_user: <%= ENV['CFN_CLUSTER_USER'] %> cfn_sqs_queue: <%= ENV['CFN_SQS_QUEUE'] %> cfn_ddb_table: <%= ENV['CFN_DDB_TABLE'] %> + custom_node_package: <%= ENV['PARALLELCLUSTER_NODE_URL'] %> - name: torque_config_MasterServer run_list: @@ -106,6 +108,7 @@ suites: cfn_cluster_user: <%= ENV['CFN_CLUSTER_USER'] %> cfn_sqs_queue: <%= ENV['CFN_SQS_QUEUE'] %> cfn_ddb_table: <%= ENV['CFN_DDB_TABLE'] %> + custom_node_package: <%= ENV['PARALLELCLUSTER_NODE_URL'] %> - name: slurm_config_MasterServer run_list: @@ -125,6 +128,7 @@ suites: cfn_cluster_user: <%= ENV['CFN_CLUSTER_USER'] %> cfn_sqs_queue: <%= ENV['CFN_SQS_QUEUE'] %> cfn_ddb_table: <%= ENV['CFN_DDB_TABLE'] %> + custom_node_package: <%= ENV['PARALLELCLUSTER_NODE_URL'] %> - name: sge_config_ComputeFleet run_list: @@ -144,6 +148,7 @@ suites: cfn_cluster_user: <%= ENV['CFN_CLUSTER_USER'] %> cfn_sqs_queue: <%= ENV['CFN_SQS_QUEUE'] %> cfn_master: <%= ENV['CFN_MASTER'] %> + custom_node_package: <%= ENV['PARALLELCLUSTER_NODE_URL'] %> - name: torque_config_ComputeFleet run_list: @@ -163,6 +168,7 @@ suites: cfn_cluster_user: <%= ENV['CFN_CLUSTER_USER'] %> cfn_sqs_queue: <%= ENV['CFN_SQS_QUEUE'] %> cfn_master: <%= ENV['CFN_MASTER'] %> + 
custom_node_package: <%= ENV['PARALLELCLUSTER_NODE_URL'] %> - name: slurm_config_ComputeFleet run_list: @@ -182,3 +188,4 @@ suites: cfn_cluster_user: <%= ENV['CFN_CLUSTER_USER'] %> cfn_sqs_queue: <%= ENV['CFN_SQS_QUEUE'] %> cfn_master: <%= ENV['CFN_MASTER'] %> + custom_node_package: <%= ENV['PARALLELCLUSTER_NODE_URL'] %> From 523f7c8b13248f835a8ee7204ecba13b3e53029d Mon Sep 17 00:00:00 2001 From: Enrico Usai Date: Thu, 11 Apr 2019 11:39:34 +0200 Subject: [PATCH 02/50] Remove double quotes from post install args variable In this way the cfn_postinstall_args is aligned with the cfn_preinstall_args variable. Signed-off-by: Enrico Usai --- templates/default/cfnconfig.erb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/templates/default/cfnconfig.erb b/templates/default/cfnconfig.erb index 6c75330bf1..1c9c55de69 100644 --- a/templates/default/cfnconfig.erb +++ b/templates/default/cfnconfig.erb @@ -2,7 +2,7 @@ stack_name=<%= node['cfncluster']['stack_name'] %> cfn_preinstall=<%= node['cfncluster']['cfn_preinstall'] %> cfn_preinstall_args=<%= node['cfncluster']['cfn_preinstall_args'] %> cfn_postinstall=<%= node['cfncluster']['cfn_postinstall'] %> -cfn_postinstall_args="<%= node['cfncluster']['cfn_postinstall_args'] %>" +cfn_postinstall_args=<%= node['cfncluster']['cfn_postinstall_args'] %> cfn_region=<%= node['cfncluster']['cfn_region'] %> cfn_scheduler=<%= node['cfncluster']['cfn_scheduler'] %> cfn_scheduler_slots=<%= node['cfncluster']['cfn_scheduler_slots'] %> From b25404bf5be9a9eb8f1aba1b2da1a05d9f638a8d Mon Sep 17 00:00:00 2001 From: Enrico Usai Date: Thu, 11 Apr 2019 16:23:12 +0200 Subject: [PATCH 03/50] Remove duplicated (and wrong) mount point for ebs volumes This mount point is wrong when the customer is using multiple ebs volumes because the cfn_shared_dir contains the comma separated list of the mount points. Furthermore the same action is performed in the same script, few lines below, by splitting by comma. 
Signed-off-by: Enrico Usai --- recipes/_compute_base_config.rb | 9 --------- 1 file changed, 9 deletions(-) diff --git a/recipes/_compute_base_config.rb b/recipes/_compute_base_config.rb index 515fa41b6d..ad6273c644 100644 --- a/recipes/_compute_base_config.rb +++ b/recipes/_compute_base_config.rb @@ -13,15 +13,6 @@ # OR CONDITIONS OF ANY KIND, express or implied. See the License for the specific language governing permissions and # limitations under the License. -# Created shared mount point -directory node['cfncluster']['cfn_shared_dir'] do - mode '1777' - owner 'root' - group 'root' - recursive true - action :create -end - node.default['cfncluster']['cfn_master'] = node['cfncluster']['cfn_master'].split('.')[0] nfs_master = node['cfncluster']['cfn_master'] From 992a1e7333103d71b97dfe6faaca055cd552c0a0 Mon Sep 17 00:00:00 2001 From: Luca Carrogu Date: Fri, 12 Apr 2019 15:03:40 +0200 Subject: [PATCH 04/50] Pin supervisord to version 3.4.0 This is the latest version with Python 2.6 support Signed-off-by: Luca Carrogu --- attributes/default.rb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/attributes/default.rb b/attributes/default.rb index 8dd6533a90..35dfce816e 100644 --- a/attributes/default.rb +++ b/attributes/default.rb @@ -21,7 +21,7 @@ # Python packages default['cfncluster']['cfncluster-version'] = '2.3.1' default['cfncluster']['cfncluster-node-version'] = '2.3.1' -default['cfncluster']['cfncluster-supervisor-version'] = '3.3.1' +default['cfncluster']['supervisor-version'] = '3.4.0' # URLs to software packages used during install recipes # Gridengine software default['cfncluster']['sge']['version'] = '8.1.9' From 454f4cf5d5f7516fc927b43ccf92491adf5e0607 Mon Sep 17 00:00:00 2001 From: Luca Carrogu Date: Mon, 8 Apr 2019 15:33:20 +0200 Subject: [PATCH 05/50] Bump version to 2.3.2 alpha 1 Signed-off-by: Luca Carrogu --- amis/packer_variables.json | 4 ++-- attributes/default.rb | 6 +++--- metadata.rb | 2 +- 3 files changed, 6 insertions(+), 6 
deletions(-) diff --git a/amis/packer_variables.json b/amis/packer_variables.json index 456e6aa2c7..f12ba551c2 100644 --- a/amis/packer_variables.json +++ b/amis/packer_variables.json @@ -1,6 +1,6 @@ { - "parallelcluster_version": "2.3.1", - "parallelcluster_cookbook_version": "2.3.1", + "parallelcluster_version": "2.3.2a1", + "parallelcluster_cookbook_version": "2.3.2", "chef_version": "14.2.0", "ridley_version": "5.1.1", "berkshelf_version": "7.0.4" diff --git a/attributes/default.rb b/attributes/default.rb index 35dfce816e..5df4c04cf6 100644 --- a/attributes/default.rb +++ b/attributes/default.rb @@ -19,9 +19,9 @@ default['cfncluster']['scripts_dir'] = "#{node['cfncluster']['base_dir']}/scripts" default['cfncluster']['license_dir'] = "#{node['cfncluster']['base_dir']}/licenses" # Python packages -default['cfncluster']['cfncluster-version'] = '2.3.1' -default['cfncluster']['cfncluster-node-version'] = '2.3.1' -default['cfncluster']['supervisor-version'] = '3.4.0' +default['cfncluster']['cfncluster-version'] = '2.3.2a1' +default['cfncluster']['cfncluster-node-version'] = '2.3.2a1' +default['cfncluster']['cfncluster-supervisor-version'] = '3.4.0' # URLs to software packages used during install recipes # Gridengine software default['cfncluster']['sge']['version'] = '8.1.9' diff --git a/metadata.rb b/metadata.rb index d5750390f8..38981109fc 100644 --- a/metadata.rb +++ b/metadata.rb @@ -7,7 +7,7 @@ issues_url 'https://github.com/aws/aws-parallelcluster-cookbook/issues' source_url 'https://github.com/aws/aws-parallelcluster-cookbook' chef_version '14.2.0' -version '2.3.1' +version '2.3.2' supports 'amazon' supports 'centos', '= 6' From 52e9500e95896640f0b1892445d5a65ff627d410 Mon Sep 17 00:00:00 2001 From: Luca Carrogu Date: Tue, 16 Apr 2019 10:06:22 +0200 Subject: [PATCH 06/50] Fix variable name Signed-off-by: Luca Carrogu --- attributes/default.rb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/attributes/default.rb b/attributes/default.rb index 
5df4c04cf6..d16a68eb09 100644 --- a/attributes/default.rb +++ b/attributes/default.rb @@ -21,7 +21,7 @@ # Python packages default['cfncluster']['cfncluster-version'] = '2.3.2a1' default['cfncluster']['cfncluster-node-version'] = '2.3.2a1' -default['cfncluster']['cfncluster-supervisor-version'] = '3.4.0' +default['cfncluster']['supervisor-version'] = '3.4.0' # URLs to software packages used during install recipes # Gridengine software default['cfncluster']['sge']['version'] = '8.1.9' From 64dca67a7fef48ac2c98eb8b3ed78b8656eea7ed Mon Sep 17 00:00:00 2001 From: Francesco De Martino Date: Mon, 15 Apr 2019 16:26:16 +0200 Subject: [PATCH 07/50] jobwatcher.cfg: remove compute_instance_type config value The jobwatcher now retrieves this value dynamically from the stack parameters Signed-off-by: Francesco De Martino --- templates/default/jobwatcher.cfg.erb | 1 - 1 file changed, 1 deletion(-) diff --git a/templates/default/jobwatcher.cfg.erb b/templates/default/jobwatcher.cfg.erb index 84b7f25330..16f9c54d12 100644 --- a/templates/default/jobwatcher.cfg.erb +++ b/templates/default/jobwatcher.cfg.erb @@ -4,4 +4,3 @@ scheduler = <%= node['cfncluster']['cfn_scheduler'] %> stack_name = <%= node['cfncluster']['stack_name'] %> cfncluster_dir = <%= node['cfncluster']['base_dir'] %> proxy = <%= node['cfncluster']['cfn_proxy'] %> -compute_instance_type = <%= node['cfncluster']['compute_instance_type'] %> From 6701f977f6e4163d990912c5af90fd372405d34f Mon Sep 17 00:00:00 2001 From: Francesco De Martino Date: Wed, 17 Apr 2019 16:17:59 +0200 Subject: [PATCH 08/50] sqswatcher.cfg: remove unneeded max_queue_size config value Signed-off-by: Francesco De Martino --- templates/default/sqswatcher.cfg.erb | 1 - 1 file changed, 1 deletion(-) diff --git a/templates/default/sqswatcher.cfg.erb b/templates/default/sqswatcher.cfg.erb index 5e1b6b84bf..bc2e6e37b2 100644 --- a/templates/default/sqswatcher.cfg.erb +++ b/templates/default/sqswatcher.cfg.erb @@ -5,5 +5,4 @@ table_name = <%= 
node['cfncluster']['cfn_ddb_table'] %> scheduler = <%= node['cfncluster']['cfn_scheduler'] %> cluster_user = <%= node['cfncluster']['cfn_cluster_user'] %> proxy = <%= node['cfncluster']['cfn_proxy'] %> -max_queue_size = <%= node['cfncluster']['cfn_max_queue_size'] %> stack_name = <%= node['cfncluster']['stack_name'] %> From 782981585983a19ee09bcd4ab5651caaa701a066 Mon Sep 17 00:00:00 2001 From: Luca Carrogu Date: Wed, 17 Apr 2019 16:24:20 +0200 Subject: [PATCH 09/50] Install slurm libpmi/libpmi2 The libpmi is now in a separate slurm package see https://bugs.schedmd.com/show_bug.cgi?id=4511 so it needs to be installed explicitly This will solve https://github.com/aws/aws-parallelcluster/issues/1008 Signed-off-by: Luca Carrogu --- recipes/slurm_install.rb | 1 + 1 file changed, 1 insertion(+) diff --git a/recipes/slurm_install.rb b/recipes/slurm_install.rb index d8abad4457..004be71281 100644 --- a/recipes/slurm_install.rb +++ b/recipes/slurm_install.rb @@ -40,6 +40,7 @@ CORES=$(grep processor /proc/cpuinfo | wc -l) make -j $CORES make install + make install-contrib SLURM # TODO: Fix, so it works for upgrade creates '/opt/slurm/bin/srun' From 47b983280a6879b4685ae60b531efaf02f73edf7 Mon Sep 17 00:00:00 2001 From: Luca Carrogu Date: Fri, 19 Apr 2019 23:17:18 +0200 Subject: [PATCH 10/50] Avoid failure when partition changes require a reboot The patch will let the script continue also when the following error is returned by the parted command: "Error: Partition(s) Y on /dev/XXX have been written, but we have been unable to inform the kernel of the change, probably because it/they are in use. As a result, the old partition(s) will remain in use. You should reboot now before making further changes." 
Signed-off-by: Luca Carrogu --- files/default/setup-ephemeral-drives.sh | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/files/default/setup-ephemeral-drives.sh b/files/default/setup-ephemeral-drives.sh index b35201cd35..1b1c2c831d 100644 --- a/files/default/setup-ephemeral-drives.sh +++ b/files/default/setup-ephemeral-drives.sh @@ -21,6 +21,13 @@ function error_exit () { exit 1 } +function exec_command() { + _command_output=$($@ 2>&1) + _exit_code=$? + + # Do not set RC=1 if error says that changes have been written but a reboot is required to inform the kernel + [[ $_exit_code -ne 0 && $(echo "${_command_output}" | grep -i "you should reboot now") ]] && RC=1 +} # LVM stripe, format, mount ephemeral drives function setup_ephemeral_drives () { @@ -48,11 +55,11 @@ function setup_ephemeral_drives () { for d in $DEVS; do d=/dev/${d} dd if=/dev/zero of=${d} bs=32k count=1 || RC=1 - parted -s ${d} mklabel gpt || RC=1 - parted -s ${d} || RC=1 - parted -s -a optimal ${d} mkpart primary 1MB 100% || RC=1 - partprobe - parted -s ${d} set 1 lvm on || RC=1 + exec_command "parted -s ${d} mklabel gpt" + exec_command "parted -s ${d}" + exec_command "parted -s -a optimal ${d} mkpart primary 1MB 100%" + partprobe ${d} + exec_command "parted -s ${d} set 1 lvm on" if [ $IS_NVME -eq 1 ]; then PARTITIONS="${d}p1 $PARTITIONS" else From fe3abdc28387f6d6866a315ed2bc913de668a63f Mon Sep 17 00:00:00 2001 From: Luca Carrogu Date: Tue, 23 Apr 2019 19:09:04 +0200 Subject: [PATCH 11/50] Delete previous network configuration left by cloud-init This patch avoids network service restart failures when a configuration file of an old network interface (not present anymore in the current instance launch) was found Signed-off-by: Luca Carrogu --- files/default/ami_cleanup.sh | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/files/default/ami_cleanup.sh b/files/default/ami_cleanup.sh index 57515fa011..124f5609ed 100644 --- a/files/default/ami_cleanup.sh +++ 
b/files/default/ami_cleanup.sh @@ -5,5 +5,12 @@ rm -rf /var/lib/cloud/instances/* rm -f /var/lib/cloud/instance rm -rf /etc/ssh/ssh_host_* rm -f /etc/udev/rules.d/70-persistent-net.rules +for ifcfg in $(ls /etc/sysconfig/network-scripts/ifcfg-*) +do + if [ "$(basename ${ifcfg})" != "ifcfg-lo" ] + then + rm -f "${ifcfg}" + fi +done find /var/log -type f -exec /bin/rm -v {} \; touch /var/log/lastlog \ No newline at end of file From 83c22f0f1e8f0a968333e0c0d34cea261872a702 Mon Sep 17 00:00:00 2001 From: Luca Carrogu Date: Tue, 23 Apr 2019 19:09:34 +0200 Subject: [PATCH 12/50] Execute cleanup script also for official released AMI Signed-off-by: Luca Carrogu --- amis/packer_alinux.json | 1 - amis/packer_centos6.json | 1 - amis/packer_centos7.json | 1 - amis/packer_ubuntu1404.json | 1 - amis/packer_ubuntu1604.json | 1 - 5 files changed, 5 deletions(-) diff --git a/amis/packer_alinux.json b/amis/packer_alinux.json index 494f34b165..87f305463d 100644 --- a/amis/packer_alinux.json +++ b/amis/packer_alinux.json @@ -246,7 +246,6 @@ }, { "type" : "shell", - "only": ["custom-alinux"], "inline" : [ "sudo /usr/local/sbin/ami_cleanup.sh" ] diff --git a/amis/packer_centos6.json b/amis/packer_centos6.json index d2bc2df2fb..9d73a41fed 100644 --- a/amis/packer_centos6.json +++ b/amis/packer_centos6.json @@ -265,7 +265,6 @@ }, { "type" : "shell", - "only": ["custom-centos6"], "inline" : [ "sudo /usr/local/sbin/ami_cleanup.sh" ] diff --git a/amis/packer_centos7.json b/amis/packer_centos7.json index fefa92c370..80bd33ed75 100644 --- a/amis/packer_centos7.json +++ b/amis/packer_centos7.json @@ -277,7 +277,6 @@ }, { "type" : "shell", - "only": ["custom-centos7"], "inline" : [ "sudo /usr/local/sbin/ami_cleanup.sh" ] diff --git a/amis/packer_ubuntu1404.json b/amis/packer_ubuntu1404.json index 6edac7f68c..5630cc8156 100644 --- a/amis/packer_ubuntu1404.json +++ b/amis/packer_ubuntu1404.json @@ -271,7 +271,6 @@ }, { "type" : "shell", - "only": ["custom-ubuntu1404"], "inline" : [ "sudo 
/usr/local/sbin/ami_cleanup.sh" ] diff --git a/amis/packer_ubuntu1604.json b/amis/packer_ubuntu1604.json index 350c6cfad3..4fb81b4e2e 100644 --- a/amis/packer_ubuntu1604.json +++ b/amis/packer_ubuntu1604.json @@ -274,7 +274,6 @@ }, { "type" : "shell", - "only": ["custom-ubuntu1604"], "inline" : [ "sudo /usr/local/sbin/ami_cleanup.sh" ] From 4aba58cfcbb286719294d208f34ace1fb21dead9 Mon Sep 17 00:00:00 2001 From: Francesco De Martino Date: Tue, 23 Apr 2019 19:26:17 +0200 Subject: [PATCH 13/50] torque: restarting pbs_mom only when hostname changes get applied This fixes the issue with torque on centos 7 Signed-off-by: Francesco De Martino --- recipes/_compute_torque_config.rb | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/recipes/_compute_torque_config.rb b/recipes/_compute_torque_config.rb index a049c10abd..965775e11c 100644 --- a/recipes/_compute_torque_config.rb +++ b/recipes/_compute_torque_config.rb @@ -31,7 +31,10 @@ end # Enable and start pbs_mom service +# pbs_mom is restarted only after network service is restarted in +# order to wait for the hostname changes to be applied service "pbs_mom" do supports restart: true - action %i[enable restart] + action :enable + subscribes :restart, 'service[network]', :immediately end From da4eb01a970aa0f30d89c7e3c8012fc947121c4c Mon Sep 17 00:00:00 2001 From: Francesco De Martino Date: Wed, 24 Apr 2019 10:07:51 +0200 Subject: [PATCH 14/50] ami_cleanup: remove network interface configs only if created by cloud-init Signed-off-by: Francesco De Martino --- files/default/ami_cleanup.sh | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) diff --git a/files/default/ami_cleanup.sh b/files/default/ami_cleanup.sh index 124f5609ed..32b13c77a1 100644 --- a/files/default/ami_cleanup.sh +++ b/files/default/ami_cleanup.sh @@ -5,12 +5,6 @@ rm -rf /var/lib/cloud/instances/* rm -f /var/lib/cloud/instance rm -rf /etc/ssh/ssh_host_* rm -f /etc/udev/rules.d/70-persistent-net.rules -for ifcfg in $(ls
/etc/sysconfig/network-scripts/ifcfg-*) -do - if [ "$(basename ${ifcfg})" != "ifcfg-lo" ] - then - rm -f "${ifcfg}" - fi -done +grep -l "Created by cloud-init on instance boot automatically" /etc/sysconfig/network-scripts/ifcfg-* | xargs rm -f find /var/log -type f -exec /bin/rm -v {} \; -touch /var/log/lastlog \ No newline at end of file +touch /var/log/lastlog From e0776747320f46c7479b87d25a2dda98d457f89f Mon Sep 17 00:00:00 2001 From: Francesco De Martino Date: Wed, 24 Apr 2019 09:47:20 +0200 Subject: [PATCH 15/50] torque: starting pbs_mom and restart on network/hostname changes Signed-off-by: Francesco De Martino --- recipes/_compute_torque_config.rb | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/recipes/_compute_torque_config.rb b/recipes/_compute_torque_config.rb index 965775e11c..4bd98b3543 100644 --- a/recipes/_compute_torque_config.rb +++ b/recipes/_compute_torque_config.rb @@ -35,6 +35,7 @@ # order to wait for the hostname changes to be applied service "pbs_mom" do supports restart: true - action :enable - subscribes :restart, 'service[network]', :immediately + action %i[enable start] + subscribes :restart, 'service[network]', :delayed + subscribes :restart, 'ohai[reload_hostname]', :delayed end From e2cd225eb0b688cdb2d9527b343b8b4fc910d89e Mon Sep 17 00:00:00 2001 From: Francesco De Martino Date: Wed, 24 Apr 2019 13:29:26 +0200 Subject: [PATCH 16/50] ami_cleanup: remove ifcfg-eth0 in centos7 Related bug https://bugs.centos.org/view.php?id=13836#c33128 Signed-off-by: Francesco De Martino --- files/default/ami_cleanup.sh | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/files/default/ami_cleanup.sh b/files/default/ami_cleanup.sh index 32b13c77a1..c6993be854 100644 --- a/files/default/ami_cleanup.sh +++ b/files/default/ami_cleanup.sh @@ -6,5 +6,12 @@ rm -f /var/lib/cloud/instance rm -rf /etc/ssh/ssh_host_* rm -f /etc/udev/rules.d/70-persistent-net.rules grep -l "Created by cloud-init on instance boot automatically" 
/etc/sysconfig/network-scripts/ifcfg-* | xargs rm -f + +# https://bugs.centos.org/view.php?id=13836#c33128 +source /etc/os-release +if [ "${ID}${VERSION_ID}" == "centos7" ]; then + rm -f /etc/sysconfig/network-scripts/ifcfg-eth0 +fi + find /var/log -type f -exec /bin/rm -v {} \; touch /var/log/lastlog From 0d95ba061eddb6667a2761c0cf65bca0e6a356f5 Mon Sep 17 00:00:00 2001 From: Francesco De Martino Date: Wed, 24 Apr 2019 16:25:45 +0200 Subject: [PATCH 17/50] torque: use :immediately when subscribed to restart on service[network] Signed-off-by: Francesco De Martino --- recipes/_compute_torque_config.rb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/recipes/_compute_torque_config.rb b/recipes/_compute_torque_config.rb index 4bd98b3543..9db0347f92 100644 --- a/recipes/_compute_torque_config.rb +++ b/recipes/_compute_torque_config.rb @@ -36,6 +36,6 @@ service "pbs_mom" do supports restart: true action %i[enable start] - subscribes :restart, 'service[network]', :delayed + subscribes :restart, 'service[network]', :immediately subscribes :restart, 'ohai[reload_hostname]', :delayed end From ad4fe59867d802106d2c4340108d277a5fd6e9be Mon Sep 17 00:00:00 2001 From: Luca Carrogu Date: Wed, 24 Apr 2019 16:36:18 +0200 Subject: [PATCH 18/50] Skip test for custom AMI Skip test if jq is not installed, because for custom ami it is installed during bootstrap time (inside cloudformation userdata) Signed-off-by: Luca Carrogu --- .kitchen.yml | 7 +++++++ recipes/tests.rb | 20 +++++++++++--------- 2 files changed, 18 insertions(+), 9 deletions(-) diff --git a/.kitchen.yml b/.kitchen.yml index 6e71fed242..92f58f4b1b 100644 --- a/.kitchen.yml +++ b/.kitchen.yml @@ -69,6 +69,7 @@ suites: cfn_shared_dir: <%= ENV['CFN_SHARED_DIR'] %> cfn_cluster_user: <%= ENV['CFN_CLUSTER_USER'] %> custom_node_package: <%= ENV['PARALLELCLUSTER_NODE_URL'] %> + os: <%= ENV['OS'] %> - name: sge_config_MasterServer run_list: @@ -89,6 +90,7 @@ suites: cfn_sqs_queue: <%= ENV['CFN_SQS_QUEUE'] %> 
cfn_ddb_table: <%= ENV['CFN_DDB_TABLE'] %> custom_node_package: <%= ENV['PARALLELCLUSTER_NODE_URL'] %> + os: <%= ENV['OS'] %> - name: torque_config_MasterServer run_list: @@ -109,6 +111,7 @@ suites: cfn_sqs_queue: <%= ENV['CFN_SQS_QUEUE'] %> cfn_ddb_table: <%= ENV['CFN_DDB_TABLE'] %> custom_node_package: <%= ENV['PARALLELCLUSTER_NODE_URL'] %> + os: <%= ENV['OS'] %> - name: slurm_config_MasterServer run_list: @@ -129,6 +132,7 @@ suites: cfn_sqs_queue: <%= ENV['CFN_SQS_QUEUE'] %> cfn_ddb_table: <%= ENV['CFN_DDB_TABLE'] %> custom_node_package: <%= ENV['PARALLELCLUSTER_NODE_URL'] %> + os: <%= ENV['OS'] %> - name: sge_config_ComputeFleet run_list: @@ -149,6 +153,7 @@ suites: cfn_sqs_queue: <%= ENV['CFN_SQS_QUEUE'] %> cfn_master: <%= ENV['CFN_MASTER'] %> custom_node_package: <%= ENV['PARALLELCLUSTER_NODE_URL'] %> + os: <%= ENV['OS'] %> - name: torque_config_ComputeFleet run_list: @@ -169,6 +174,7 @@ suites: cfn_sqs_queue: <%= ENV['CFN_SQS_QUEUE'] %> cfn_master: <%= ENV['CFN_MASTER'] %> custom_node_package: <%= ENV['PARALLELCLUSTER_NODE_URL'] %> + os: <%= ENV['OS'] %> - name: slurm_config_ComputeFleet run_list: @@ -189,3 +195,4 @@ suites: cfn_sqs_queue: <%= ENV['CFN_SQS_QUEUE'] %> cfn_master: <%= ENV['CFN_MASTER'] %> custom_node_package: <%= ENV['PARALLELCLUSTER_NODE_URL'] %> + os: <%= ENV['OS'] %> diff --git a/recipes/tests.rb b/recipes/tests.rb index 239c4bf259..a17df82ac2 100644 --- a/recipes/tests.rb +++ b/recipes/tests.rb @@ -99,13 +99,15 @@ end end -bash 'execute jq' do - cwd Chef::Config[:file_cache_path] - code <<-JQMERGE - # Set PATH as in the UserData script of the CloudFormation template - export PATH="/usr/local/sbin:/usr/local/bin:/sbin:/bin:/usr/sbin:/usr/bin:/opt/aws/bin" - echo '{"cfncluster": {"cfn_region": "eu-west-3"}, "run_list": "recipe[aws-parallelcluster::sge_config]"}' > /tmp/dna.json - echo '{ "cfncluster" : { "ganglia_enabled" : "yes" } }' > /tmp/extra.json - jq --argfile f1 /tmp/dna.json --argfile f2 /tmp/extra.json -n '$f1 + $f2 | .cfncluster = 
$f1.cfncluster + $f2.cfncluster' || exit 1 - JQMERGE +unless node['cfncluster']['os'].end_with?("-custom") + bash 'execute jq' do + cwd Chef::Config[:file_cache_path] + code <<-JQMERGE + # Set PATH as in the UserData script of the CloudFormation template + export PATH="/usr/local/sbin:/usr/local/bin:/sbin:/bin:/usr/sbin:/usr/bin:/opt/aws/bin" + echo '{"cfncluster": {"cfn_region": "eu-west-3"}, "run_list": "recipe[aws-parallelcluster::sge_config]"}' > /tmp/dna.json + echo '{ "cfncluster" : { "ganglia_enabled" : "yes" } }' > /tmp/extra.json + jq --argfile f1 /tmp/dna.json --argfile f2 /tmp/extra.json -n '$f1 + $f2 | .cfncluster = $f1.cfncluster + $f2.cfncluster' || exit 1 + JQMERGE + end end From c0556ecc29a40d20225fe1d6bc40417c8759702d Mon Sep 17 00:00:00 2001 From: Francesco De Martino Date: Fri, 26 Apr 2019 14:08:20 +0200 Subject: [PATCH 19/50] use full master FQDN when mounting nfs Signed-off-by: Francesco De Martino --- recipes/_compute_base_config.rb | 2 -- recipes/_compute_sge_config.rb | 2 +- recipes/_compute_slurm_config.rb | 2 +- 3 files changed, 2 insertions(+), 4 deletions(-) diff --git a/recipes/_compute_base_config.rb b/recipes/_compute_base_config.rb index ad6273c644..cef0d9266c 100644 --- a/recipes/_compute_base_config.rb +++ b/recipes/_compute_base_config.rb @@ -13,8 +13,6 @@ # OR CONDITIONS OF ANY KIND, express or implied. See the License for the specific language governing permissions and # limitations under the License. -node.default['cfncluster']['cfn_master'] = node['cfncluster']['cfn_master'].split('.')[0] - nfs_master = node['cfncluster']['cfn_master'] # Mount EFS directory with efs_mount recipe diff --git a/recipes/_compute_sge_config.rb b/recipes/_compute_sge_config.rb index 88aaa1ae0b..c37389e6c3 100644 --- a/recipes/_compute_sge_config.rb +++ b/recipes/_compute_sge_config.rb @@ -14,7 +14,7 @@ # limitations under the License. 
# Mount /opt/sge over NFS -nfs_master = node['cfncluster']['cfn_master'].split('.')[0] +nfs_master = node['cfncluster']['cfn_master'] mount '/opt/sge' do device "#{nfs_master}:/opt/sge" fstype "nfs" diff --git a/recipes/_compute_slurm_config.rb b/recipes/_compute_slurm_config.rb index e23f2deda7..5f056f62c5 100644 --- a/recipes/_compute_slurm_config.rb +++ b/recipes/_compute_slurm_config.rb @@ -14,7 +14,7 @@ # limitations under the License. # Mount /opt/slurm over NFS -nfs_master = node['cfncluster']['cfn_master'].split('.')[0] +nfs_master = node['cfncluster']['cfn_master'] mount '/opt/slurm' do device "#{nfs_master}:/opt/slurm" fstype "nfs" From 4f2585cb1c991257bf247a7159197dfb8ccdf164 Mon Sep 17 00:00:00 2001 From: Luca Carrogu Date: Tue, 30 Apr 2019 11:20:51 +0200 Subject: [PATCH 20/50] Use custom chef install URL to resolve Ubuntu dpkg lock issue Issue is reported in https://github.com/chef/bento/issues/609 Using custom chef URL instead of default one (https://www.chef.io/chef/install.sh) we are able to skip the error "dpkg: error: dpkg status database is locked by another process" Signed-off-by: Luca Carrogu --- .kitchen.cloud.yml | 2 + util/chef-install.sh | 793 +++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 795 insertions(+) create mode 100644 util/chef-install.sh diff --git a/.kitchen.cloud.yml b/.kitchen.cloud.yml index e99174d817..285d47ebc9 100644 --- a/.kitchen.cloud.yml +++ b/.kitchen.cloud.yml @@ -16,6 +16,8 @@ driver_config: provisioner: name: chef_zero require_chef_omnibus: 14.2.0 + # use custom chef install URL to cope with issue https://github.com/chef/bento/issues/609 + chef_omnibus_url: https://raw.githubusercontent.com/aws/aws-parallelcluster-cookbook/develop/util/chef-install.sh retry_on_exit_code: - 35 # 35 is the exit code signaling that the node is rebooting max_retries: 1 diff --git a/util/chef-install.sh b/util/chef-install.sh new file mode 100644 index 0000000000..c7347a08f9 --- /dev/null +++ b/util/chef-install.sh @@ -0,0 
+1,793 @@ +#!/bin/sh +# WARNING: REQUIRES /bin/sh +# +# - must run on /bin/sh on solaris 9 +# - must run on /bin/sh on AIX 6.x +# +# Copyright:: Copyright (c) 2010-2015 Chef Software, Inc. +# License:: Apache License, Version 2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +# helpers.sh +############ +# This section has some helper functions to make life easier. +# +# Outputs: +# $tmp_dir: secure-ish temp directory that can be used during installation. +############ + +# Check whether a command exists - returns 0 if it does, 1 if it does not +exists() { + if command -v $1 >/dev/null 2>&1 + then + return 0 + else + return 1 + fi +} + +# Output the instructions to report bug about this script +report_bug() { + echo "Version: $version" + echo "" + echo "Please file a Bug Report at https://github.com/chef/omnitruck/issues/new" + echo "Alternatively, feel free to open a Support Ticket at https://www.chef.io/support/tickets" + echo "More Chef support resources can be found at https://www.chef.io/support" + echo "" + echo "Please include as many details about the problem as possible i.e., how to reproduce" + echo "the problem (if possible), type of the Operating System and its version, etc.," + echo "and any other relevant details that might help us with troubleshooting." + echo "" +} + +checksum_mismatch() { + echo "Package checksum mismatch!" + report_bug + exit 1 +} + +unable_to_retrieve_package() { + echo "Unable to retrieve a valid package!" 
+ report_bug + echo "Metadata URL: $metadata_url" + if test "x$download_url" != "x"; then + echo "Download URL: $download_url" + fi + if test "x$stderr_results" != "x"; then + echo "\nDEBUG OUTPUT FOLLOWS:\n$stderr_results" + fi + exit 1 +} + +http_404_error() { + echo "Omnitruck artifact does not exist for version $version on platform $platform" + echo "" + echo "Either this means:" + echo " - We do not support $platform" + echo " - We do not have an artifact for $version" + echo "" + echo "This is often the latter case due to running a prerelease or RC version of chef" + echo "or a gem version which was only pushed to rubygems and not omnitruck." + echo "" + echo "You may be able to set your knife[:bootstrap_version] to the most recent stable" + echo "release of Chef to fix this problem (or the most recent stable major version number)." + echo "" + echo "In order to test the version parameter, adventurous users may take the Metadata URL" + echo "below and modify the '&v=' parameter until you successfully get a URL that" + echo "does not 404 (e.g. via curl or wget). You should be able to use '&v=11' or '&v=12'" + echo "succesfully." + echo "" + echo "If you cannot fix this problem by setting the bootstrap_version, it probably means" + echo "that $platform is not supported." + echo "" + # deliberately do not call report_bug to suppress bug report noise. + echo "Metadata URL: $metadata_url" + if test "x$download_url" != "x"; then + echo "Download URL: $download_url" + fi + if test "x$stderr_results" != "x"; then + echo "\nDEBUG OUTPUT FOLLOWS:\n$stderr_results" + fi + exit 1 +} + +capture_tmp_stderr() { + # spool up /tmp/stderr from all the commands we called + if test -f "$tmp_dir/stderr"; then + output=`cat $tmp_dir/stderr` + stderr_results="${stderr_results}\nSTDERR from $1:\n\n$output\n" + rm $tmp_dir/stderr + fi +} + +# do_wget URL FILENAME +do_wget() { + echo "trying wget..." 
+ wget --user-agent="User-Agent: mixlib-install/3.11.5" -O "$2" "$1" 2>$tmp_dir/stderr + rc=$? + # check for 404 + grep "ERROR 404" $tmp_dir/stderr 2>&1 >/dev/null + if test $? -eq 0; then + echo "ERROR 404" + http_404_error + fi + + # check for bad return status or empty output + if test $rc -ne 0 || test ! -s "$2"; then + capture_tmp_stderr "wget" + return 1 + fi + + return 0 +} + +# do_curl URL FILENAME +do_curl() { + echo "trying curl..." + curl -A "User-Agent: mixlib-install/3.11.5" --retry 5 -sL -D $tmp_dir/stderr "$1" > "$2" + rc=$? + # check for 404 + grep "404 Not Found" $tmp_dir/stderr 2>&1 >/dev/null + if test $? -eq 0; then + echo "ERROR 404" + http_404_error + fi + + # check for bad return status or empty output + if test $rc -ne 0 || test ! -s "$2"; then + capture_tmp_stderr "curl" + return 1 + fi + + return 0 +} + +# do_fetch URL FILENAME +do_fetch() { + echo "trying fetch..." + fetch --user-agent="User-Agent: mixlib-install/3.11.5" -o "$2" "$1" 2>$tmp_dir/stderr + # check for bad return status + test $? -ne 0 && return 1 + return 0 +} + +# do_perl URL FILENAME +do_perl() { + echo "trying perl..." + perl -e 'use LWP::Simple; getprint($ARGV[0]);' "$1" > "$2" 2>$tmp_dir/stderr + rc=$? + # check for 404 + grep "404 Not Found" $tmp_dir/stderr 2>&1 >/dev/null + if test $? -eq 0; then + echo "ERROR 404" + http_404_error + fi + + # check for bad return status or empty output + if test $rc -ne 0 || test ! -s "$2"; then + capture_tmp_stderr "perl" + return 1 + fi + + return 0 +} + +# do_python URL FILENAME +do_python() { + echo "trying python..." + python -c "import sys,urllib2; sys.stdout.write(urllib2.urlopen(urllib2.Request(sys.argv[1], headers={ 'User-Agent': 'mixlib-install/3.11.5' })).read())" "$1" > "$2" 2>$tmp_dir/stderr + rc=$? + # check for 404 + grep "HTTP Error 404" $tmp_dir/stderr 2>&1 >/dev/null + if test $? -eq 0; then + echo "ERROR 404" + http_404_error + fi + + # check for bad return status or empty output + if test $rc -ne 0 || test ! 
-s "$2"; then + capture_tmp_stderr "python" + return 1 + fi + return 0 +} + +# returns 0 if checksums match +do_checksum() { + if exists sha256sum; then + echo "Comparing checksum with sha256sum..." + checksum=`sha256sum $1 | awk '{ print $1 }'` + return `test "x$checksum" = "x$2"` + elif exists shasum; then + echo "Comparing checksum with shasum..." + checksum=`shasum -a 256 $1 | awk '{ print $1 }'` + return `test "x$checksum" = "x$2"` + else + echo "WARNING: could not find a valid checksum program, pre-install shasum or sha256sum in your O/S image to get valdation..." + return 0 + fi +} + +# do_download URL FILENAME +do_download() { + echo "downloading $1" + echo " to file $2" + + url=`echo $1` + if test "x$platform" = "xsolaris2"; then + if test "x$platform_version" = "x5.9" -o "x$platform_version" = "x5.10"; then + # solaris 9 lacks openssl, solaris 10 lacks recent enough credentials - your base O/S is completely insecure, please upgrade + url=`echo $url | sed -e 's/https/http/'` + fi + fi + + # we try all of these until we get success. + # perl, in particular may be present but LWP::Simple may not be installed + + if exists wget; then + do_wget $url $2 && return 0 + fi + + if exists curl; then + do_curl $url $2 && return 0 + fi + + if exists fetch; then + do_fetch $url $2 && return 0 + fi + + if exists perl; then + do_perl $url $2 && return 0 + fi + + if exists python; then + do_python $url $2 && return 0 + fi + + unable_to_retrieve_package +} + +# install_file TYPE FILENAME +# TYPE is "rpm", "deb", "solaris", "sh", etc. +install_file() { + echo "Installing $project $version" + case "$1" in + "rpm") + if test "x$platform" = "xnexus" || test "x$platform" = "xios_xr"; then + echo "installing with yum..." + yum install -yv "$2" + else + echo "installing with rpm..." + rpm -Uvh --oldpackage --replacepkgs "$2" + fi + ;; + "deb") + echo "installing with dpkg..." + until dpkg -i "$2"; do + echo "Retrying dpkg -i $2 ..." 
+ sleep 1 + done + ;; + "bff") + echo "installing with installp..." + installp -aXYgd "$2" all + ;; + "solaris") + echo "installing with pkgadd..." + echo "conflict=nocheck" > $tmp_dir/nocheck + echo "action=nocheck" >> $tmp_dir/nocheck + echo "mail=" >> $tmp_dir/nocheck + pkgrm -a $tmp_dir/nocheck -n $project >/dev/null 2>&1 || true + pkgadd -G -n -d "$2" -a $tmp_dir/nocheck $project + ;; + "pkg") + echo "installing with installer..." + cd / && /usr/sbin/installer -pkg "$2" -target / + ;; + "dmg") + echo "installing dmg file..." + hdiutil detach "/Volumes/chef_software" >/dev/null 2>&1 || true + hdiutil attach "$2" -mountpoint "/Volumes/chef_software" + cd / && /usr/sbin/installer -pkg `find "/Volumes/chef_software" -name \*.pkg` -target / + hdiutil detach "/Volumes/chef_software" + ;; + "sh" ) + echo "installing with sh..." + sh "$2" + ;; + "p5p" ) + echo "installing p5p package..." + pkg install -g "$2" $project + ;; + *) + echo "Unknown filetype: $1" + report_bug + exit 1 + ;; + esac + if test $? -ne 0; then + echo "Installation failed" + report_bug + exit 1 + fi +} + +if test "x$TMPDIR" = "x"; then + tmp="/tmp" +else + tmp=$TMPDIR +fi +# secure-ish temp dir creation without having mktemp available (DDoS-able but not exploitable) +tmp_dir="$tmp/install.sh.$$" +(umask 077 && mkdir $tmp_dir) || exit 1 + +############ +# end of helpers.sh +############ + + +# script_cli_parameters.sh +############ +# This section reads the CLI parameters for the install script and translates +# them to the local parameters to be used later by the script. +# +# Outputs: +# $version: Requested version to be installed. +# $channel: Channel to install the product from +# $project: Project to be installed +# $cmdline_filename: Name of the package downloaded on local disk. +# $cmdline_dl_dir: Name of the directory downloaded package will be saved to on local disk. +# $install_strategy: Method of package installations. default strategy is to always install upon exec.
Set to "once" to skip if project is installed +# $download_url_override: Install package downloaded from a direct URL. +# $checksum: SHA256 for download_url_override file (optional) +############ + +# Defaults +channel="stable" +project="chef" + +while getopts pnv:c:f:P:d:s:l:a: opt +do + case "$opt" in + + v) version="$OPTARG";; + c) channel="$OPTARG";; + p) channel="current";; # compat for prerelease option + n) channel="current";; # compat for nightlies option + f) cmdline_filename="$OPTARG";; + P) project="$OPTARG";; + d) cmdline_dl_dir="$OPTARG";; + s) install_strategy="$OPTARG";; + l) download_url_override="$OPTARG";; + a) checksum="$OPTARG";; # expected SHA256 of the -l download; needs "a:" in the optstring + \?) # unknown flag + echo >&2 \ + "usage: $0 [-P project] [-c release_channel] [-v version] [-f filename | -d download_dir] [-s install_strategy] [-l download_url_override] [-a checksum]" + exit 1;; + esac +done + +shift `expr $OPTIND - 1` + + +if test -d "/opt/$project" && test "x$install_strategy" = "xonce"; then + echo "$project installation detected" + echo "install_strategy set to 'once'" + echo "Nothing to install" + exit +fi + + +# platform_detection.sh +############ +# This section makes platform detection compatible with omnitruck on the system +# it runs. +# +# Outputs: +# $platform: Name of the platform. +# $platform_version: Version of the platform. +# $machine: System's architecture. +############ + +# +# Platform and Platform Version detection +# +# NOTE: This should now match ohai platform and platform_version matching. +# do not invent new platform and platform_version schemas, just make this behave +# like what ohai returns as platform and platform_version for the server. +# +# ALSO NOTE: Do not mangle platform or platform_version here. It is less error +# prone and more future-proof to do that in the server, and then all omnitruck clients +# will 'inherit' the changes (install.sh is not the only client of the omnitruck +# endpoint out there).
+# + +machine=`uname -m` +os=`uname -s` + +if test -f "/etc/lsb-release" && grep -q DISTRIB_ID /etc/lsb-release && ! grep -q wrlinux /etc/lsb-release; then + platform=`grep DISTRIB_ID /etc/lsb-release | cut -d "=" -f 2 | tr '[A-Z]' '[a-z]'` + platform_version=`grep DISTRIB_RELEASE /etc/lsb-release | cut -d "=" -f 2` + + if test "$platform" = "\"cumulus linux\""; then + platform="cumulus_linux" + elif test "$platform" = "\"cumulus networks\""; then + platform="cumulus_networks" + fi + +elif test -f "/etc/debian_version"; then + platform="debian" + platform_version=`cat /etc/debian_version` +elif test -f "/etc/Eos-release"; then + # EOS may also contain /etc/redhat-release so this check must come first. + platform=arista_eos + platform_version=`awk '{print $4}' /etc/Eos-release` + machine="i386" +elif test -f "/etc/redhat-release"; then + platform=`sed 's/^\(.\+\) release.*/\1/' /etc/redhat-release | tr '[A-Z]' '[a-z]'` + platform_version=`sed 's/^.\+ release \([.0-9]\+\).*/\1/' /etc/redhat-release` + + if test "$platform" = "xenserver"; then + # Current XenServer 6.2 is based on CentOS 5, platform is not reset to "el" server should hanlde response + platform="xenserver" + else + # FIXME: use "redhat" + platform="el" + fi + +elif test -f "/etc/system-release"; then + platform=`sed 's/^\(.\+\) release.\+/\1/' /etc/system-release | tr '[A-Z]' '[a-z]'` + platform_version=`sed 's/^.\+ release \([.0-9]\+\).*/\1/' /etc/system-release | tr '[A-Z]' '[a-z]'` + case $platform in amazon*) # sh compat method of checking for a substring + platform="el" + + . /etc/os-release + platform_version=$VERSION_ID + if test "$platform_version" = "2"; then + platform_version="7" + else + # VERSION_ID will match YYYY.MM for Amazon Linux AMIs + platform_version="6" + fi + esac + +# Apple OS X +elif test -f "/usr/bin/sw_vers"; then + platform="mac_os_x" + # Matching the tab-space with sed is error-prone + platform_version=`sw_vers | awk '/^ProductVersion:/ { print $2 }' | cut -d. 
-f1,2` + + # x86_64 Apple hardware often runs 32-bit kernels (see OHAI-63) + x86_64=`sysctl -n hw.optional.x86_64` + if test $x86_64 -eq 1; then + machine="x86_64" + fi +elif test -f "/etc/release"; then + machine=`/usr/bin/uname -p` + if grep -q SmartOS /etc/release; then + platform="smartos" + platform_version=`grep ^Image /etc/product | awk '{ print $3 }'` + else + platform="solaris2" + platform_version=`/usr/bin/uname -r` + fi +elif test -f "/etc/SuSE-release"; then + if grep -q 'Enterprise' /etc/SuSE-release; + then + platform="sles" + platform_version=`awk '/^VERSION/ {V = $3}; /^PATCHLEVEL/ {P = $3}; END {print V "." P}' /etc/SuSE-release` + else + platform="suse" + platform_version=`awk '/^VERSION =/ { print $3 }' /etc/SuSE-release` + fi +elif test "x$os" = "xFreeBSD"; then + platform="freebsd" + platform_version=`uname -r | sed 's/-.*//'` +elif test "x$os" = "xAIX"; then + platform="aix" + platform_version="`uname -v`.`uname -r`" + machine="powerpc" +elif test -f "/etc/os-release"; then + . /etc/os-release + if test "x$CISCO_RELEASE_INFO" != "x"; then + . $CISCO_RELEASE_INFO + fi + + platform=$ID + platform_version=$VERSION +fi + +if test "x$platform" = "x"; then + echo "Unable to determine platform!" + report_bug + exit 1 +fi + +# +# NOTE: platform mangling in the install.sh is DEPRECATED +# +# - install.sh should be true to ohai and should not remap +# platform or platform versions. +# +# - remapping platform and mangling platform version numbers is +# now the complete responsibility of the server-side endpoints +# + +major_version=`echo $platform_version | cut -d. -f1` +case $platform in + # FIXME: should remove this case statement completely + "el") + # FIXME: "el" is deprecated, should use "redhat" + platform_version=$major_version + ;; + "debian") + if test "x$major_version" = "x5"; then + # This is here for potential back-compat. + # We do not have 5 in versions we publish for anymore but we + # might have it for earlier versions.
+ platform_version="6" + else + platform_version=$major_version + fi + ;; + "freebsd") + platform_version=$major_version + ;; + "sles") + platform_version=$major_version + ;; + "suse") + platform_version=$major_version + ;; +esac + +# normalize the architecture we detected +case $machine in + "x86_64"|"amd64"|"x64") + machine="x86_64" + ;; + "i386"|"i86pc"|"x86"|"i686") + machine="i386" + ;; + "sparc"|"sun4u"|"sun4v") + machine="sparc" + ;; +esac + +if test "x$platform_version" = "x"; then + echo "Unable to determine platform version!" + report_bug + exit 1 +fi + +if test "x$platform" = "xsolaris2"; then + # hack up the path on Solaris to find wget, pkgadd + PATH=/usr/sfw/bin:/usr/sbin:$PATH + export PATH +fi + +echo "$platform $platform_version $machine" + +############ +# end of platform_detection.sh +############ + + +# All of the download utilities in this script load common proxy env vars. +# If variables are set they will override any existing env vars. +# Otherwise, default proxy env vars will be loaded by the respective +# download utility. + +if test "x$https_proxy" != "x"; then + echo "setting https_proxy: $https_proxy" + export HTTPS_PROXY=$https_proxy + export https_proxy=$https_proxy +fi + +if test "x$http_proxy" != "x"; then + echo "setting http_proxy: $http_proxy" + export HTTP_PROXY=$http_proxy + export http_proxy=$http_proxy +fi + +if test "x$ftp_proxy" != "x"; then + echo "setting ftp_proxy: $ftp_proxy" + export FTP_PROXY=$ftp_proxy + export ftp_proxy=$ftp_proxy +fi + +if test "x$no_proxy" != "x"; then + echo "setting no_proxy: $no_proxy" + export NO_PROXY=$no_proxy + export no_proxy=$no_proxy +fi + + +# fetch_metadata.sh +############ +# This section calls omnitruck to get the information about the build to be +# installed. 
+# +# Inputs: +# $channel: +# $project: +# $version: +# $platform: +# $platform_version: +# $machine: +# $tmp_dir: +# +# Outputs: +# $download_url: +# $sha256: +############ + +if test "x$download_url_override" = "x"; then + echo "Getting information for $project $channel $version for $platform..." + + metadata_filename="$tmp_dir/metadata.txt" + metadata_url="https://www.chef.io/$channel/$project/metadata?v=$version&p=$platform&pv=$platform_version&m=$machine" + + do_download "$metadata_url" "$metadata_filename" + + cat "$metadata_filename" + + echo "" + # check that all the mandatory fields in the downloaded metadata are there + if grep '^url' $metadata_filename > /dev/null && grep '^sha256' $metadata_filename > /dev/null; then + echo "downloaded metadata file looks valid..." + else + echo "downloaded metadata file is corrupted or an uncaught error was encountered in downloading the file..." + # this generally means one of the download methods downloaded a 404 or something like that and then reported a successful exit code, + # and this should be fixed in the function that was doing the download. + report_bug + exit 1 + fi + + download_url=`awk '$1 == "url" { print $2 }' "$metadata_filename"` + sha256=`awk '$1 == "sha256" { print $2 }' "$metadata_filename"` +else + download_url=$download_url_override + # Set sha256 to empty string if checksum not set + sha256=${checksum=""} +fi + +############ +# end of fetch_metadata.sh +############ + + +# fetch_package.sh +############ +# This section fetchs a package from $download_url and verifies its metadata. +# +# Inputs: +# $download_url: +# $tmp_dir: +# Optional Inputs: +# $cmdline_filename: Name of the package downloaded on local disk. +# $cmdline_dl_dir: Name of the directory downloaded package will be saved to on local disk. +# +# Outputs: +# $download_filename: Name of the downloaded file on local disk. +# $filetype: Type of the file downloaded. 
+############ + +filename=`echo $download_url | sed -e 's/^.*\///'` +filetype=`echo $filename | sed -e 's/^.*\.//'` + +# use either $tmp_dir, the provided directory (-d) or the provided filename (-f) +if test "x$cmdline_filename" != "x"; then + download_filename="$cmdline_filename" +elif test "x$cmdline_dl_dir" != "x"; then + download_filename="$cmdline_dl_dir/$filename" +else + download_filename="$tmp_dir/$filename" +fi + +# ensure the parent directory where to download the installer always exists +download_dir=`dirname $download_filename` +(umask 077 && mkdir -p $download_dir) || exit 1 + +# check if we have that file locally available and if so verify the checksum +# Use cases +# 1) metadata - new download +# 2) metadata - cached download when cmdline_dl_dir set +# 3) url override - no checksum new download +# 4) url override - with checksum new download +# 5) url override - with checksum cached download when cmdline_dl_dir set + +cached_file_available="false" +verify_checksum="true" + +if test -f $download_filename; then + echo "$download_filename exists" + cached_file_available="true" +fi + +if test "x$download_url_override" != "x"; then + echo "Download URL override specified" + if test "x$cached_file_available" = "xtrue"; then + echo "Verifying local file" + if test "x$sha256" = "x"; then + echo "Checksum not specified, ignoring existing file" + cached_file_available="false" # download new file + verify_checksum="false" # no checksum to compare after download + elif do_checksum "$download_filename" "$sha256"; then + echo "Checksum match, using existing file" + cached_file_available="true" # don't need to download file + verify_checksum="false" # don't need to checksum again + else + echo "Checksum mismatch, ignoring existing file" + cached_file_available="false" # download new file + verify_checksum="true" # checksum new downloaded file + fi + else + echo "$download_filename not found" + cached_file_available="false" # download new file + if test "x$sha256" 
= "x"; then + verify_checksum="false" # no checksum to compare after download + else + verify_checksum="true" # checksum new downloaded file + fi + fi +fi + +if test "x$cached_file_available" != "xtrue"; then + do_download "$download_url" "$download_filename" +fi + +if test "x$verify_checksum" = "xtrue"; then + do_checksum "$download_filename" "$sha256" || checksum_mismatch +fi + +############ +# end of fetch_package.sh +############ + + +# install_package.sh +############ +# Installs a package and removed the temp directory. +# +# Inputs: +# $download_filename: Name of the file to be installed. +# $filetype: Type of the file to be installed. +# $version: The version requested. Used only for warning user if not set. +############ + +if test "x$version" = "x" -a "x$CI" != "xtrue"; then + echo + echo "WARNING WARNING WARNING WARNING WARNING WARNING WARNING WARNING WARNING" + echo + echo "You are installing an omnibus package without a version pin. If you are installing" + echo "on production servers via an automated process this is DANGEROUS and you will" + echo "be upgraded without warning on new releases, even to new major releases." + echo "Letting the version float is only appropriate in desktop, test, development or" + echo "CI/CD environments." 
+ echo + echo "WARNING WARNING WARNING WARNING WARNING WARNING WARNING WARNING WARNING" + echo +fi + +install_file $filetype "$download_filename" + +if test "x$tmp_dir" != "x"; then + rm -r "$tmp_dir" +fi + +############ +# end of install_package.sh +############ From 184c0379440bddafb53c3c33842f618c237bda6b Mon Sep 17 00:00:00 2001 From: Luca Carrogu Date: Tue, 30 Apr 2019 12:16:51 +0200 Subject: [PATCH 21/50] Skip SGE installation on compute node SGE installation folder is mounted from master node The installation is done when "cfn_node_type" is "MasterServer" (at runtime) or is "nil" (at packer time) Signed-off-by: Luca Carrogu --- attributes/default.rb | 1 - recipes/sge_config.rb | 1 - recipes/sge_install.rb | 175 +++++++++++++++++++++++------------------ 3 files changed, 98 insertions(+), 79 deletions(-) diff --git a/attributes/default.rb b/attributes/default.rb index d16a68eb09..74753fb4b0 100644 --- a/attributes/default.rb +++ b/attributes/default.rb @@ -166,7 +166,6 @@ default['cfncluster']['cfn_shared_dir'] = '/shared' default['cfncluster']['cfn_efs_shared_dir'] = 'NONE' default['cfncluster']['cfn_efs'] = nil -default['cfncluster']['cfn_node_type'] = nil default['cfncluster']['cfn_master'] = nil default['cfncluster']['cfn_cluster_user'] = 'ec2-user' default['cfncluster']['cfn_fsx_options'] = 'NONE' diff --git a/recipes/sge_config.rb b/recipes/sge_config.rb index 233497cf13..eb48a64656 100644 --- a/recipes/sge_config.rb +++ b/recipes/sge_config.rb @@ -16,7 +16,6 @@ include_recipe 'aws-parallelcluster::base_config' include_recipe 'aws-parallelcluster::sge_install' -# case node['cfncluster']['cfn_node_type'] case node['cfncluster']['cfn_node_type'] when 'MasterServer' include_recipe 'aws-parallelcluster::_master_sge_config' diff --git a/recipes/sge_install.rb b/recipes/sge_install.rb index 929420f048..0da68929e2 100644 --- a/recipes/sge_install.rb +++ b/recipes/sge_install.rb @@ -15,92 +15,113 @@ include_recipe 'aws-parallelcluster::base_install' -sge_tarball 
= "#{node['cfncluster']['sources_dir']}/sge-#{node['cfncluster']['sge']['version']}.tar.gz" +case node['cfncluster']['cfn_node_type'] +when 'MasterServer', nil + sge_tarball = "#{node['cfncluster']['sources_dir']}/sge-#{node['cfncluster']['sge']['version']}.tar.gz" -# Get SGE tarball -remote_file sge_tarball do - source node['cfncluster']['sge']['url'] - mode '0644' - retries 3 - retry_delay 5 - # TODO: Add version or checksum checks - not_if { ::File.exist?(sge_tarball) } -end + # Get SGE tarball + remote_file sge_tarball do + source node['cfncluster']['sge']['url'] + mode '0644' + retries 3 + retry_delay 5 + # TODO: Add version or checksum checks + not_if { ::File.exist?(sge_tarball) } + end -# Install SGE -bash 'make install' do - user 'root' - group 'root' - cwd Chef::Config[:file_cache_path] - environment 'SGE_ROOT' => '/opt/sge' - code <<-SGE - tar xf #{sge_tarball} - cd sge-#{node['cfncluster']['sge']['version']}/source - CORES=$(grep processor /proc/cpuinfo | wc -l) - sh scripts/bootstrap.sh -no-java -no-jni -no-herd - ./aimk -pam -no-remote -no-java -no-jni -no-herd -parallel $CORES - ./aimk -man -no-java -no-jni -no-herd -parallel $CORES - scripts/distinst -local -allall -noexit - mkdir $SGE_ROOT - echo instremote=false >> distinst.private - gearch=`dist/util/arch` - echo 'y'| scripts/distinst -local -allall ${gearch} - SGE - # TODO: Fix, so it works for upgrade - creates '/opt/sge/bin/lx-amd64/sge_qmaster' -end + # Install SGE + bash 'make install' do + user 'root' + group 'root' + cwd Chef::Config[:file_cache_path] + environment 'SGE_ROOT' => '/opt/sge' + code <<-SGE + tar xf #{sge_tarball} + cd sge-#{node['cfncluster']['sge']['version']}/source + CORES=$(grep processor /proc/cpuinfo | wc -l) + sh scripts/bootstrap.sh -no-java -no-jni -no-herd + ./aimk -pam -no-remote -no-java -no-jni -no-herd -parallel $CORES + ./aimk -man -no-java -no-jni -no-herd -parallel $CORES + scripts/distinst -local -allall -noexit + mkdir $SGE_ROOT + echo instremote=false >> 
distinst.private + gearch=`dist/util/arch` + echo 'y'| scripts/distinst -local -allall ${gearch} + SGE + # TODO: Fix, so it works for upgrade + creates '/opt/sge/bin/lx-amd64/sge_qmaster' + end -# Copy qconf utils (Downloaded from http://arc.liv.ac.uk/SGE/downloads/qconf_scripts.tar.gz) -cookbook_file 'qconf_scripts.tar.gz' do - path '/opt/sge/util/qconf_scripts.tar.gz' - user 'root' - group 'root' - mode '0644' -end + # Copy qconf utils (Downloaded from http://arc.liv.ac.uk/SGE/downloads/qconf_scripts.tar.gz) + cookbook_file 'qconf_scripts.tar.gz' do + path '/opt/sge/util/qconf_scripts.tar.gz' + user 'root' + group 'root' + mode '0644' + end -bash "extract_qconf_util" do - code <<-EXTRACTQCONFUTIL - tar xf /opt/sge/util/qconf_scripts.tar.gz -C /opt/sge/util --strip-components=1 --no-same-permissions --no-same-owner - EXTRACTQCONFUTIL -end + bash "extract_qconf_util" do + code <<-EXTRACTQCONFUTIL + tar xf /opt/sge/util/qconf_scripts.tar.gz -C /opt/sge/util --strip-components=1 --no-same-permissions --no-same-owner + EXTRACTQCONFUTIL + end -# Disbale the AddQueue, so that we can manage slots per instance -replace_or_add "AddQueue" do - path "/opt/sge/inst_sge" - pattern "AddQueue" - line "#AddQueue" -end + # Disbale the AddQueue, so that we can manage slots per instance + replace_or_add "AddQueue" do + path "/opt/sge/inst_sge" + pattern "AddQueue" + line "#AddQueue" + end -# Only on CentOS/RHEL7 update the initd -if node['platform_family'] == 'rhel' && node['platform_version'].to_i >= 7 && node['platform'] != 'amazon' - execute 'sed' do - command 'sed -i s/remote_fs/local_fs/g /opt/sge/util/rctemplates/sgemaster_template' + # Only on CentOS/RHEL7 update the initd + if node['platform_family'] == 'rhel' && node['platform_version'].to_i >= 7 && node['platform'] != 'amazon' + execute 'sed' do + command 'sed -i s/remote_fs/local_fs/g /opt/sge/util/rctemplates/sgemaster_template' + end + execute 'sed' do + command 'sed -i s/remote_fs/local_fs/g 
/opt/sge/util/rctemplates/sgeexecd_template' + end end - execute 'sed' do - command 'sed -i s/remote_fs/local_fs/g /opt/sge/util/rctemplates/sgeexecd_template' + + # Setup sgeadmin user + user "sgeadmin" do + manage_home true + comment 'sgeadmin user' + home "/home/sgeadmin" + system true + shell '/bin/bash' end -end -# Setup sgeadmin user -user "sgeadmin" do - manage_home true - comment 'sgeadmin user' - home "/home/sgeadmin" - system true - shell '/bin/bash' -end + # Copy required licensing files + directory "#{node['cfncluster']['license_dir']}/sge" -# Copy required licensing files -directory "#{node['cfncluster']['license_dir']}/sge" + bash 'copy license stuff' do + user 'root' + group 'root' + cwd Chef::Config[:file_cache_path] + code <<-SGELICENSE + cd sge-#{node['cfncluster']['sge']['version']}/LICENCES + cp -v SISSL #{node['cfncluster']['license_dir']}/sge/SISSL + SGELICENSE + # TODO: Fix, so it works for upgrade + creates "#{node['cfncluster']['license_dir']}/sge/SISSL" + end +when 'ComputeFleet' + # Created SGE shared mount point + directory "/opt/sge" do + mode '1777' + owner 'root' + group 'root' + action :create + end -bash 'copy license stuff' do - user 'root' - group 'root' - cwd Chef::Config[:file_cache_path] - code <<-SGELICENSE - cd sge-#{node['cfncluster']['sge']['version']}/LICENCES - cp -v SISSL #{node['cfncluster']['license_dir']}/sge/SISSL - SGELICENSE - # TODO: Fix, so it works for upgrade - creates "#{node['cfncluster']['license_dir']}/sge/SISSL" + # Setup sgeadmin user without creating the home (mounted from master) + user "sgeadmin" do + manage_home false + comment 'sgeadmin user' + home "/home/sgeadmin" + system true + shell '/bin/bash' + end end + From 838d75c8bbac354223dc79769d38cf7e8f092084 Mon Sep 17 00:00:00 2001 From: Luca Carrogu Date: Tue, 30 Apr 2019 12:39:58 +0200 Subject: [PATCH 22/50] Skip Slurm installation on compute node Slurm installation folder is mounted from master node The installation is done when "cfn_node_type" is 
"MasterServer" (at runtime) or is "nil" (at packer time) Signed-off-by: Luca Carrogu --- recipes/slurm_config.rb | 1 - recipes/slurm_install.rb | 124 +++++++++++++++++++++++---------------- 2 files changed, 72 insertions(+), 53 deletions(-) diff --git a/recipes/slurm_config.rb b/recipes/slurm_config.rb index 5ae6499ff7..6786bc9154 100644 --- a/recipes/slurm_config.rb +++ b/recipes/slurm_config.rb @@ -37,7 +37,6 @@ only_if { node['init_package'] != 'systemd' } end -# case node['cfncluster']['cfn_node_type'] case node['cfncluster']['cfn_node_type'] when 'MasterServer' include_recipe 'aws-parallelcluster::_master_slurm_config' diff --git a/recipes/slurm_install.rb b/recipes/slurm_install.rb index 004be71281..a0720c7e94 100644 --- a/recipes/slurm_install.rb +++ b/recipes/slurm_install.rb @@ -16,43 +16,81 @@ include_recipe 'aws-parallelcluster::base_install' include_recipe 'aws-parallelcluster::munge_install' -slurm_tarball = "#{node['cfncluster']['sources_dir']}/slurm-#{node['cfncluster']['slurm']['version']}.tar.gz" +case node['cfncluster']['cfn_node_type'] +when 'MasterServer', nil + slurm_tarball = "#{node['cfncluster']['sources_dir']}/slurm-#{node['cfncluster']['slurm']['version']}.tar.gz" -# Get slurm tarball -remote_file slurm_tarball do - source node['cfncluster']['slurm']['url'] - mode '0644' - retries 3 - retry_delay 5 - # TODO: Add version or checksum checks - not_if { ::File.exist?(slurm_tarball) } -end + # Get slurm tarball + remote_file slurm_tarball do + source node['cfncluster']['slurm']['url'] + mode '0644' + retries 3 + retry_delay 5 + # TODO: Add version or checksum checks + not_if { ::File.exist?(slurm_tarball) } + end -# Install Slurm -bash 'make install' do - user 'root' - group 'root' - cwd Chef::Config[:file_cache_path] - code <<-SLURM - tar xf #{slurm_tarball} - cd slurm-slurm-#{node['cfncluster']['slurm']['version']} - ./configure --prefix=/opt/slurm - CORES=$(grep processor /proc/cpuinfo | wc -l) - make -j $CORES - make install - make 
install-contrib - SLURM - # TODO: Fix, so it works for upgrade - creates '/opt/slurm/bin/srun' -end + # Install Slurm + bash 'make install' do + user 'root' + group 'root' + cwd Chef::Config[:file_cache_path] + code <<-SLURM + tar xf #{slurm_tarball} + cd slurm-slurm-#{node['cfncluster']['slurm']['version']} + ./configure --prefix=/opt/slurm + CORES=$(grep processor /proc/cpuinfo | wc -l) + make -j $CORES + make install + make install-contrib + SLURM + # TODO: Fix, so it works for upgrade + creates '/opt/slurm/bin/srun' + end + + # Setup slurm user + user "slurm" do + manage_home true + comment 'slurm user' + home "/home/slurm" + system true + shell '/bin/bash' + end + + # Copy required licensing files + directory "#{node['cfncluster']['license_dir']}/slurm" -# Setup slurm user -user "slurm" do - manage_home true - comment 'slurm user' - home "/home/slurm" - system true - shell '/bin/bash' + bash 'copy license stuff' do + user 'root' + group 'root' + cwd Chef::Config[:file_cache_path] + code <<-SLURMLICENSE + cd slurm-slurm-#{node['cfncluster']['slurm']['version']} + cp -v COPYING #{node['cfncluster']['license_dir']}/slurm/COPYING + cp -v DISCLAIMER #{node['cfncluster']['license_dir']}/slurm/DISCLAIMER + cp -v LICENSE.OpenSSL #{node['cfncluster']['license_dir']}/slurm/LICENSE.OpenSSL + cp -v README.rst #{node['cfncluster']['license_dir']}/slurm/README.rst + SLURMLICENSE + # TODO: Fix, so it works for upgrade + creates "#{node['cfncluster']['license_dir']}/slurm/README.rst" + end +when 'ComputeFleet' + # Created Slurm shared mount point + directory "/opt/slurm" do + mode '1777' + owner 'root' + group 'root' + action :create + end + + # Setup slurm user without creating the home (mounted from master) + user "slurm" do + manage_home false + comment 'slurm user' + home "/home/slurm" + system true + shell '/bin/bash' + end end cookbook_file '/etc/init.d/slurm' do @@ -63,21 +101,3 @@ action :create only_if { node['platform_family'] == 'debian' && !node['init_package'] == 
'systemd' } end - -# Copy required licensing files -directory "#{node['cfncluster']['license_dir']}/slurm" - -bash 'copy license stuff' do - user 'root' - group 'root' - cwd Chef::Config[:file_cache_path] - code <<-SLURMLICENSE - cd slurm-slurm-#{node['cfncluster']['slurm']['version']} - cp -v COPYING #{node['cfncluster']['license_dir']}/slurm/COPYING - cp -v DISCLAIMER #{node['cfncluster']['license_dir']}/slurm/DISCLAIMER - cp -v LICENSE.OpenSSL #{node['cfncluster']['license_dir']}/slurm/LICENSE.OpenSSL - cp -v README.rst #{node['cfncluster']['license_dir']}/slurm/README.rst - SLURMLICENSE - # TODO: Fix, so it works for upgrade - creates "#{node['cfncluster']['license_dir']}/slurm/README.rst" -end From fe0dff70c5fc3a39ed6ae4a3f3c414455100f692 Mon Sep 17 00:00:00 2001 From: Francesco De Martino Date: Wed, 1 May 2019 14:12:27 +0200 Subject: [PATCH 23/50] Slurm: decrease SlurmdTimeout to 120 seconds SlurmdTimeout: the interval, in seconds, that the Slurm controller waits for slurmd to respond before configuring that node's state to DOWN. Reducing it in order to have a faster reaction to nodes that are failing. Signed-off-by: Francesco De Martino --- templates/default/slurm.conf.erb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/templates/default/slurm.conf.erb b/templates/default/slurm.conf.erb index ad049832a5..9bdee0e513 100644 --- a/templates/default/slurm.conf.erb +++ b/templates/default/slurm.conf.erb @@ -55,7 +55,7 @@ ReturnToService=1 # # TIMERS SlurmctldTimeout=300 -SlurmdTimeout=300 +SlurmdTimeout=120 InactiveLimit=0 MinJobAge=300 KillWait=30 From 402947adbe9e7a44c7febebebd9b4d6c8c10779a Mon Sep 17 00:00:00 2001 From: Francesco De Martino Date: Tue, 30 Apr 2019 15:29:08 +0200 Subject: [PATCH 24/50] Add finalize recipe for supervisord and compute_ready move supervisord start at the end of the user data in a finalize chef recipe. 
This solves the problem of the nodewatcher that was started before the end of chef recipes and post_install script and therefore the idletime was being mistakenly computed Signed-off-by: Francesco De Martino --- .kitchen.yml | 6 ++++++ files/default/compute_ready | 1 + recipes/base_config.rb | 6 ------ recipes/finalize.rb | 25 +++++++++++++++++++++++++ 4 files changed, 32 insertions(+), 6 deletions(-) create mode 100644 recipes/finalize.rb diff --git a/.kitchen.yml b/.kitchen.yml index 92f58f4b1b..7fe74645f1 100644 --- a/.kitchen.yml +++ b/.kitchen.yml @@ -75,6 +75,7 @@ suites: run_list: - recipe[aws-parallelcluster::_prep_env] - recipe[aws-parallelcluster::sge_config] + - recipe[aws-parallelcluster::finalize] - recipe[aws-parallelcluster::tests] attributes: cfncluster: @@ -96,6 +97,7 @@ suites: run_list: - recipe[aws-parallelcluster::_prep_env] - recipe[aws-parallelcluster::torque_config] + - recipe[aws-parallelcluster::finalize] - recipe[aws-parallelcluster::tests] attributes: cfncluster: @@ -117,6 +119,7 @@ suites: run_list: - recipe[aws-parallelcluster::_prep_env] - recipe[aws-parallelcluster::slurm_config] + - recipe[aws-parallelcluster::finalize] - recipe[aws-parallelcluster::tests] attributes: cfncluster: @@ -138,6 +141,7 @@ suites: run_list: - recipe[aws-parallelcluster::_prep_env] - recipe[aws-parallelcluster::sge_config] + - recipe[aws-parallelcluster::finalize] - recipe[aws-parallelcluster::tests] attributes: cfncluster: @@ -159,6 +163,7 @@ suites: run_list: - recipe[aws-parallelcluster::_prep_env] - recipe[aws-parallelcluster::torque_config] + - recipe[aws-parallelcluster::finalize] - recipe[aws-parallelcluster::tests] attributes: cfncluster: @@ -180,6 +185,7 @@ suites: run_list: - recipe[aws-parallelcluster::_prep_env] - recipe[aws-parallelcluster::slurm_config] + - recipe[aws-parallelcluster::finalize] - recipe[aws-parallelcluster::tests] attributes: cfncluster: diff --git a/files/default/compute_ready b/files/default/compute_ready index 
f557feb36a..3fa2dc53af 100644 --- a/files/default/compute_ready +++ b/files/default/compute_ready @@ -1,4 +1,5 @@ #!/bin/bash +set -e . /etc/parallelcluster/cfnconfig diff --git a/recipes/base_config.rb b/recipes/base_config.rb index 9f67ab5f77..509a5f599e 100644 --- a/recipes/base_config.rb +++ b/recipes/base_config.rb @@ -53,12 +53,6 @@ mode '0644' end -# Restart supervisord -service "supervisord" do - supports restart: true - action %i[enable start] -end - # Only run FSx on centos for now if node['platform'] == 'centos' or node['platform'] == 'amazon' # Mount FSx diff --git a/recipes/finalize.rb b/recipes/finalize.rb new file mode 100644 index 0000000000..883d0ad606 --- /dev/null +++ b/recipes/finalize.rb @@ -0,0 +1,25 @@ +# +# Cookbook Name:: aws-parallelcluster +# Recipe:: finalize +# +# Copyright 2019 Amazon.com, Inc. or its affiliates. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"). You may not use this file except in compliance with the +# License. A copy of the License is located at +# +# http://aws.amazon.com/apache2.0/ +# +# or in the "LICENSE.txt" file accompanying this file. This file is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES +# OR CONDITIONS OF ANY KIND, express or implied. See the License for the specific language governing permissions and +# limitations under the License. + +# Restart supervisord +service "supervisord" do + supports restart: true + action %i[enable start] +end + +execute "compute_ready" do + command "/opt/parallelcluster/scripts/compute_ready" + only_if { node['cfncluster']['cfn_node_type'] == 'ComputeFleet' } +end From 99c3a2814b9b8762ba1f40d6afaaa0689a9da343 Mon Sep 17 00:00:00 2001 From: Francesco De Martino Date: Mon, 13 May 2019 12:46:21 +0200 Subject: [PATCH 25/50] Add PATH for aws cli when executing compute_ready PATH is normally set in cfn userdata. In order to have chef recipes independent from userdata I'm setting explicitly the PATH for this command. 
Signed-off-by: Francesco De Martino --- recipes/finalize.rb | 1 + 1 file changed, 1 insertion(+) diff --git a/recipes/finalize.rb b/recipes/finalize.rb index 883d0ad606..28504afea9 100644 --- a/recipes/finalize.rb +++ b/recipes/finalize.rb @@ -21,5 +21,6 @@ execute "compute_ready" do command "/opt/parallelcluster/scripts/compute_ready" + environment('PATH' => '/usr/local/bin:/usr/bin/:$PATH') only_if { node['cfncluster']['cfn_node_type'] == 'ComputeFleet' } end From b2755e03209a83c4637231bcff3660e194815f26 Mon Sep 17 00:00:00 2001 From: Luca Carrogu Date: Tue, 14 May 2019 11:10:18 +0200 Subject: [PATCH 26/50] Double the retry limit to cope with the throttling error "Request limit exceeded" Doc https://docs.aws.amazon.com/sdkforruby/api/Aws/ConfigService/Client.html Signed-off-by: Luca Carrogu --- .kitchen.cloud.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.kitchen.cloud.yml b/.kitchen.cloud.yml index 285d47ebc9..443e688abd 100644 --- a/.kitchen.cloud.yml +++ b/.kitchen.cloud.yml @@ -2,6 +2,7 @@ driver_config: retryable_sleep: 15 retryable_tries: 20 + retry_limit: 6 aws_ssh_key_id: <%= ENV['AWS_KEYPAIR_NAME'] %> region: <%= ENV['AWS_DEFAULT_REGION'] %> instance_type: <%= ENV['AWS_FLAVOR_ID'] %> From 619b3d12bc0e8fa36e33d19edd05231a7befd183 Mon Sep 17 00:00:00 2001 From: Luca Carrogu Date: Thu, 23 May 2019 13:09:36 +0200 Subject: [PATCH 27/50] Change download URL for CloudFormation Helper Scripts package This will add support for Ubuntu in China NorthWest region (cn-northwest-1) Signed-off-by: Luca Carrogu --- CHANGELOG.md | 7 +++++++ amis/packer_centos6.json | 2 +- amis/packer_centos7.json | 2 +- amis/packer_ubuntu1404.json | 2 +- amis/packer_ubuntu1604.json | 2 +- 5 files changed, 11 insertions(+), 4 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index ede6861f6d..576fcf92ec 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -3,6 +3,13 @@ aws-parallelcluster-cookbook CHANGELOG This file is used to list changes made in each version of the AWS 
ParallelCluster cookbook. +2.4.0 +----- + +**ENHANCEMENTS** + +- Add support for Ubuntu in China region `cn-northwest-1` + 2.3.1 ----- diff --git a/amis/packer_centos6.json b/amis/packer_centos6.json index 9d73a41fed..bef51b0efc 100644 --- a/amis/packer_centos6.json +++ b/amis/packer_centos6.json @@ -251,7 +251,7 @@ "inline" : [ "region=\"{{user `region`}}\"", "bucket=\"s3.amazonaws.com\"", - "[[ ${region} =~ ^cn- ]] && bucket=\"s3.cn-north-1.amazonaws.com.cn\"", + "[[ ${region} =~ ^cn- ]] && bucket=\"s3.cn-north-1.amazonaws.com.cn/cn-north-1-aws-parallelcluster\"", "curl --retry 3 -L -o /tmp/aws-cfn-bootstrap-latest.tar.gz https://${bucket}/cloudformation-examples/aws-cfn-bootstrap-latest.tar.gz", "sudo pip install /tmp/aws-cfn-bootstrap-latest.tar.gz" ] diff --git a/amis/packer_centos7.json b/amis/packer_centos7.json index 80bd33ed75..a403d86465 100644 --- a/amis/packer_centos7.json +++ b/amis/packer_centos7.json @@ -256,7 +256,7 @@ "inline" : [ "region=\"{{user `region`}}\"", "bucket=\"s3.amazonaws.com\"", - "[[ ${region} =~ ^cn- ]] && bucket=\"s3.cn-north-1.amazonaws.com.cn\"", + "[[ ${region} =~ ^cn- ]] && bucket=\"s3.cn-north-1.amazonaws.com.cn/cn-north-1-aws-parallelcluster\"", "curl --retry 3 -L -o /tmp/aws-cfn-bootstrap-latest.tar.gz https://${bucket}/cloudformation-examples/aws-cfn-bootstrap-latest.tar.gz", "which pip2", "if [ $? 
-eq 0 ]; then sudo pip2 install /tmp/aws-cfn-bootstrap-latest.tar.gz; else sudo pip install /tmp/aws-cfn-bootstrap-latest.tar.gz; fi" diff --git a/amis/packer_ubuntu1404.json b/amis/packer_ubuntu1404.json index 5630cc8156..a04213d51f 100644 --- a/amis/packer_ubuntu1404.json +++ b/amis/packer_ubuntu1404.json @@ -257,7 +257,7 @@ "inline" : [ "region=\"{{user `region`}}\"", "bucket=\"s3.amazonaws.com\"", - "[[ ${region} =~ ^cn- ]] && bucket=\"s3.cn-north-1.amazonaws.com.cn\"", + "[[ ${region} =~ ^cn- ]] && bucket=\"s3.cn-north-1.amazonaws.com.cn/cn-north-1-aws-parallelcluster\"", "curl --retry 3 -L -o /tmp/aws-cfn-bootstrap-latest.tar.gz https://${bucket}/cloudformation-examples/aws-cfn-bootstrap-latest.tar.gz", "sudo pip install /tmp/aws-cfn-bootstrap-latest.tar.gz" ] diff --git a/amis/packer_ubuntu1604.json b/amis/packer_ubuntu1604.json index 4fb81b4e2e..4adcb2d0e7 100644 --- a/amis/packer_ubuntu1604.json +++ b/amis/packer_ubuntu1604.json @@ -260,7 +260,7 @@ "inline" : [ "region=\"{{user `region`}}\"", "bucket=\"s3.amazonaws.com\"", - "[[ ${region} =~ ^cn- ]] && bucket=\"s3.cn-north-1.amazonaws.com.cn\"", + "[[ ${region} =~ ^cn- ]] && bucket=\"s3.cn-north-1.amazonaws.com.cn/cn-north-1-aws-parallelcluster\"", "curl --retry 3 -L -o /tmp/aws-cfn-bootstrap-latest.tar.gz https://${bucket}/cloudformation-examples/aws-cfn-bootstrap-latest.tar.gz", "sudo pip install /tmp/aws-cfn-bootstrap-latest.tar.gz" ] From 46efa520b5e28d218f18da7eca1638176d29ea05 Mon Sep 17 00:00:00 2001 From: Luca Carrogu Date: Thu, 23 May 2019 17:55:53 +0200 Subject: [PATCH 28/50] Bump version to 2.4.0 Signed-off-by: Luca Carrogu --- amis/packer_variables.json | 4 ++-- metadata.rb | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/amis/packer_variables.json b/amis/packer_variables.json index f12ba551c2..ca032b508c 100644 --- a/amis/packer_variables.json +++ b/amis/packer_variables.json @@ -1,6 +1,6 @@ { - "parallelcluster_version": "2.3.2a1", - "parallelcluster_cookbook_version": 
"2.3.2", + "parallelcluster_version": "2.4.0", + "parallelcluster_cookbook_version": "2.4.0", "chef_version": "14.2.0", "ridley_version": "5.1.1", "berkshelf_version": "7.0.4" diff --git a/metadata.rb b/metadata.rb index 38981109fc..8e3111b763 100644 --- a/metadata.rb +++ b/metadata.rb @@ -7,7 +7,7 @@ issues_url 'https://github.com/aws/aws-parallelcluster-cookbook/issues' source_url 'https://github.com/aws/aws-parallelcluster-cookbook' chef_version '14.2.0' -version '2.3.2' +version '2.4.0' supports 'amazon' supports 'centos', '= 6' From f9977c765df2177d4ede024a4bf477d30347c12f Mon Sep 17 00:00:00 2001 From: Luca Carrogu Date: Sat, 25 May 2019 11:27:56 +0200 Subject: [PATCH 29/50] Bump version to 2.4.0 Signed-off-by: Luca Carrogu --- attributes/default.rb | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/attributes/default.rb b/attributes/default.rb index 74753fb4b0..b7e9b5b446 100644 --- a/attributes/default.rb +++ b/attributes/default.rb @@ -19,8 +19,8 @@ default['cfncluster']['scripts_dir'] = "#{node['cfncluster']['base_dir']}/scripts" default['cfncluster']['license_dir'] = "#{node['cfncluster']['base_dir']}/licenses" # Python packages -default['cfncluster']['cfncluster-version'] = '2.3.2a1' -default['cfncluster']['cfncluster-node-version'] = '2.3.2a1' +default['cfncluster']['cfncluster-version'] = '2.4.0' +default['cfncluster']['cfncluster-node-version'] = '2.4.0' default['cfncluster']['supervisor-version'] = '3.4.0' # URLs to software packages used during install recipes # Gridengine software From 4d368b52599cc1f5e152561c6c2de01828f81203 Mon Sep 17 00:00:00 2001 From: Francesco De Martino Date: Fri, 24 May 2019 18:06:17 +0200 Subject: [PATCH 30/50] sge: configure scheduler behaviour for unknown hosts Signed-off-by: Francesco De Martino --- recipes/_master_sge_config.rb | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/recipes/_master_sge_config.rb b/recipes/_master_sge_config.rb index 47b12ece4f..301c69296a 100644 --- 
a/recipes/_master_sge_config.rb +++ b/recipes/_master_sge_config.rb @@ -73,3 +73,17 @@ /opt/sge/util/qconf_add_list_value -mconf qmaster_params ENABLE_FORCED_QDEL global ENABLEFORCEDQDEL end + +# max_unheard: host is set to unknown after being unresponsive for the configured timeout +# reschedule_unknown: jobs on hosts in an unknown state are rescheduled/deleted after the configured timeout +# ENABLE_FORCED_QDEL_IF_UNKNOWN: force deletion on qdel command for hosts in unknown state +# ENABLE_RESCHEDULE_KILL: reschedule_unknown parameter affects also jobs which have the rerun flag not activated +bash "configure_unknown_hosts_behaviour" do + code <<-CONFIGUNKNOWN + . /opt/sge/default/common/settings.sh + /opt/sge/util/qconf_mod_attr -mconf max_unheard 00:03:00 global + /opt/sge/util/qconf_mod_attr -mconf reschedule_unknown 00:00:30 global + /opt/sge/util/qconf_add_list_value -mconf qmaster_params ENABLE_FORCED_QDEL_IF_UNKNOWN global + /opt/sge/util/qconf_add_list_value -mconf qmaster_params ENABLE_RESCHEDULE_KILL global + CONFIGUNKNOWN +end From 60bbf347101e854ff87bd70a0bdf28aa8ef89e4a Mon Sep 17 00:00:00 2001 From: Sean Smith Date: Wed, 15 May 2019 11:21:51 -0700 Subject: [PATCH 31/50] Test for EFA Install Signed-off-by: Sean Smith --- recipes/tests.rb | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/recipes/tests.rb b/recipes/tests.rb index a17df82ac2..6d8c9b8feb 100644 --- a/recipes/tests.rb +++ b/recipes/tests.rb @@ -99,6 +99,13 @@ end end +if node['cfncluster']['os'] == 'alinux' || node['cfncluster']['os'] == 'centos7' + execute 'check efa rpm installed' do + command "rpm -qa | grep libfabric && rpm -qa | grep efa-" + user node['cfncluster']['cfn_cluster_user'] + end +end + unless node['cfncluster']['os'].end_with?("-custom") bash 'execute jq' do cwd Chef::Config[:file_cache_path] From b40535d7cdf86aafb5820aa3a64045237f3a216c Mon Sep 17 00:00:00 2001 From: Luca Carrogu Date: Fri, 24 May 2019 12:26:37 +0200 Subject: [PATCH 32/50] Fix for when 
ParallelCluster version contains letters e.g. (2.3.2a1) Signed-off-by: Luca Carrogu --- util/bump-version.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/util/bump-version.sh b/util/bump-version.sh index e02c838bb0..a832a17900 100755 --- a/util/bump-version.sh +++ b/util/bump-version.sh @@ -10,7 +10,7 @@ fi NEW_VERSION=$1 CURRENT_VERSION=$(sed -ne "s/^version '\(.*\)'/\1/p" metadata.rb) -sed -i -e "s/\(.*parallelcluster.*version.*\)$CURRENT_VERSION\(.*\)/\1$NEW_VERSION\2/g" amis/packer_variables.json +sed -i -e "s/\(.*parallelcluster.*version.*\)$CURRENT_VERSION.*\(\".*\)/\1$NEW_VERSION\2/g" amis/packer_variables.json sed -i "s/default\['cfncluster'\]\['cfncluster-version'\] = '$CURRENT_VERSION'/default['cfncluster']['cfncluster-version'] = '$NEW_VERSION'/g" attributes/default.rb sed -i "s/default\['cfncluster'\]\['cfncluster-node-version'\] = '$CURRENT_VERSION'/default['cfncluster']['cfncluster-node-version'] = '$NEW_VERSION'/g" attributes/default.rb sed -i "s/version '$CURRENT_VERSION'/version '$NEW_VERSION'/g" metadata.rb From 2cb05769f114d814b5193310ed7d66bc09cecbe3 Mon Sep 17 00:00:00 2001 From: Sean Smith Date: Thu, 21 Mar 2019 13:03:22 -0700 Subject: [PATCH 33/50] Install EFA drivers Installs the EFA drivers. 
See [1] [1] https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/efa-start.html#efa-start-enable Signed-off-by: Sean Smith --- amis/packer_alinux.json | 1 + amis/packer_centos6.json | 1 + amis/packer_centos7.json | 1 + amis/packer_ubuntu1404.json | 1 + amis/packer_ubuntu1604.json | 1 + attributes/default.rb | 2 ++ recipes/_efa_install.rb | 37 +++++++++++++++++++++++++++++++++++++ recipes/_lustre_install.rb | 2 +- recipes/base_config.rb | 4 ++-- recipes/base_install.rb | 5 +++++ recipes/sge_install.rb | 1 - 11 files changed, 52 insertions(+), 4 deletions(-) create mode 100644 recipes/_efa_install.rb diff --git a/amis/packer_alinux.json b/amis/packer_alinux.json index 87f305463d..913e670ebd 100644 --- a/amis/packer_alinux.json +++ b/amis/packer_alinux.json @@ -210,6 +210,7 @@ "pause_before": "2m", "json" : { "cfncluster" : { + "cfn_region": "{{user `region`}}", "nvidia" : { "enabled" : "{{user `nvidia_enabled`}}" }, diff --git a/amis/packer_centos6.json b/amis/packer_centos6.json index bef51b0efc..a6e23513c7 100644 --- a/amis/packer_centos6.json +++ b/amis/packer_centos6.json @@ -219,6 +219,7 @@ "pause_before": "2m", "json" : { "cfncluster" : { + "cfn_region": "{{user `region`}}", "nvidia" : { "enabled" : "{{user `nvidia_enabled`}}" }, diff --git a/amis/packer_centos7.json b/amis/packer_centos7.json index a403d86465..7bced7a62f 100644 --- a/amis/packer_centos7.json +++ b/amis/packer_centos7.json @@ -224,6 +224,7 @@ "pause_before": "2m", "json" : { "cfncluster" : { + "cfn_region": "{{user `region`}}", "nvidia" : { "enabled" : "{{user `nvidia_enabled`}}" }, diff --git a/amis/packer_ubuntu1404.json b/amis/packer_ubuntu1404.json index a04213d51f..dea5dd40c4 100644 --- a/amis/packer_ubuntu1404.json +++ b/amis/packer_ubuntu1404.json @@ -224,6 +224,7 @@ "pause_before": "2m", "json" : { "cfncluster" : { + "cfn_region": "{{user `region`}}", "nvidia" : { "enabled" : "{{user `nvidia_enabled`}}" }, diff --git a/amis/packer_ubuntu1604.json b/amis/packer_ubuntu1604.json index 
4adcb2d0e7..9e3ce206b4 100644 --- a/amis/packer_ubuntu1604.json +++ b/amis/packer_ubuntu1604.json @@ -227,6 +227,7 @@ "pause_before": "2m", "json" : { "cfncluster" : { + "cfn_region": "{{user `region`}}", "nvidia" : { "enabled" : "{{user `nvidia_enabled`}}" }, diff --git a/attributes/default.rb b/attributes/default.rb index b7e9b5b446..a434e79d75 100644 --- a/attributes/default.rb +++ b/attributes/default.rb @@ -45,6 +45,8 @@ default['cfncluster']['nvidia']['enabled'] = 'no' default['cfncluster']['nvidia']['driver_url'] = 'http://download.nvidia.com/XFree86/Linux-x86_64/418.56/NVIDIA-Linux-x86_64-418.56.run' default['cfncluster']['nvidia']['cuda_url'] = 'https://developer.nvidia.com/compute/cuda/10.0/Prod/local_installers/cuda_10.0.130_410.48_linux' +# EFA +default['cfncluster']['efa']['installer_url'] = 'https://s3-us-west-2.amazonaws.com/aws-efa-installer/aws-efa-installer-latest.tar.gz' # Reboot after default_pre recipe default['cfncluster']['default_pre_reboot'] = 'true' diff --git a/recipes/_efa_install.rb b/recipes/_efa_install.rb new file mode 100644 index 0000000000..93a0c84b19 --- /dev/null +++ b/recipes/_efa_install.rb @@ -0,0 +1,37 @@ +# +# Cookbook Name:: aws-parallelcluster +# Recipe:: _efa_install +# +# Copyright 2013-2019 Amazon.com, Inc. or its affiliates. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"). You may not use this file except in compliance with the +# License. A copy of the License is located at +# +# http://aws.amazon.com/apache2.0/ +# +# or in the "LICENSE.txt" file accompanying this file. This file is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES +# OR CONDITIONS OF ANY KIND, express or implied. See the License for the specific language governing permissions and +# limitations under the License. 
+ +efa_tarball = "#{node['cfncluster']['sources_dir']}/aws-efa-installer-latest.tar.gz" + +# Get EFA Installer +remote_file efa_tarball do + source node['cfncluster']['efa']['installer_url'] + mode '0644' + retries 3 + retry_delay 5 + not_if { ::File.exist?(efa_tarball) } +end + +bash "install efa" do + cwd Chef::Config[:file_cache_path] + code <<-NODE + # default openmpi installation conflicts with new install + # new one is installed in /opt/amazon/efa/bin/ + yum remove -y openmpi openmpi-devel + tar -xzf #{efa_tarball} + cd aws-efa-installer + ./efa_installer.sh -y + NODE +end diff --git a/recipes/_lustre_install.rb b/recipes/_lustre_install.rb index 52498f073e..63b3ad4ec5 100644 --- a/recipes/_lustre_install.rb +++ b/recipes/_lustre_install.rb @@ -55,4 +55,4 @@ retry_delay 5 end -end \ No newline at end of file +end diff --git a/recipes/base_config.rb b/recipes/base_config.rb index 509a5f599e..8f35fe95d5 100644 --- a/recipes/base_config.rb +++ b/recipes/base_config.rb @@ -53,8 +53,8 @@ mode '0644' end -# Only run FSx on centos for now -if node['platform'] == 'centos' or node['platform'] == 'amazon' +# Run FSx on centos and alinux +if node['platform'] == 'centos' || node['platform'] == 'amazon' # Mount FSx include_recipe 'aws-parallelcluster::fsx_mount' end diff --git a/recipes/base_install.rb b/recipes/base_install.rb index 27d348104e..49aa875257 100644 --- a/recipes/base_install.rb +++ b/recipes/base_install.rb @@ -235,3 +235,8 @@ # Install FSx options include_recipe "aws-parallelcluster::_lustre_install" + +# Install EFA +if (node['platform'] == 'centos' && node['platform_version'].to_i >= 7) || node['platform'] == 'amazon' + include_recipe "aws-parallelcluster::_efa_install" unless node['cfncluster']['cfn_region'].start_with?("cn-") +end diff --git a/recipes/sge_install.rb b/recipes/sge_install.rb index 0da68929e2..2a9cc336d4 100644 --- a/recipes/sge_install.rb +++ b/recipes/sge_install.rb @@ -124,4 +124,3 @@ shell '/bin/bash' end end - From 
347eb2c913f6a9d752a3ca2cd2223e89ff57cfe8 Mon Sep 17 00:00:00 2001 From: Luca Carrogu Date: Thu, 30 May 2019 10:49:09 +0200 Subject: [PATCH 34/50] Fix conflict between EFA package and openmpi-devel package Once the EFA package is installed, it is not possible to install the openmpi-devel package. Make installation conditional depending on the OS and region Signed-off-by: Luca Carrogu --- attributes/default.rb | 4 ++-- recipes/base_install.rb | 9 ++++++++- 2 files changed, 10 insertions(+), 3 deletions(-) diff --git a/attributes/default.rb b/attributes/default.rb index a434e79d75..7f8b79f89f 100644 --- a/attributes/default.rb +++ b/attributes/default.rb @@ -85,7 +85,7 @@ if node['platform_version'].to_i >= 7 default['cfncluster']['base_packages'] = %w[vim ksh tcsh zsh openssl-devel ncurses-devel pam-devel net-tools openmotif-devel libXmu-devel hwloc-devel libdb-devel tcl-devel automake autoconf pyparted libtool - httpd boost-devel redhat-lsb mlocate lvm2 mpich-devel openmpi-devel R atlas-devel + httpd boost-devel redhat-lsb mlocate lvm2 mpich-devel R atlas-devel blas-devel fftw-devel libffi-devel openssl-devel dkms mariadb-devel libedit-devel libical-devel postgresql-devel postgresql-server sendmail libxml2-devel libglvnd-devel mdadm] if node['platform_version'].split('.')[1] == '6' @@ -107,7 +107,7 @@ when 'amazon' default['cfncluster']['base_packages'] = %w[vim ksh tcsh zsh openssl-devel ncurses-devel pam-devel net-tools openmotif-devel libXmu-devel hwloc-devel db4-devel tcl-devel automake autoconf pyparted libtool - httpd boost-devel redhat-lsb mlocate mpich-devel openmpi-devel R atlas-devel fftw-devel + httpd boost-devel redhat-lsb mlocate mpich-devel R atlas-devel fftw-devel libffi-devel openssl-devel dkms mysql-devel libedit-devel postgresql-devel postgresql-server sendmail cmake byacc libglvnd-devel mdadm] end diff --git a/recipes/base_install.rb b/recipes/base_install.rb index 49aa875257..e3c3b4b2f3 100644 --- a/recipes/base_install.rb +++ 
b/recipes/base_install.rb @@ -238,5 +238,12 @@ # Install EFA if (node['platform'] == 'centos' && node['platform_version'].to_i >= 7) || node['platform'] == 'amazon' - include_recipe "aws-parallelcluster::_efa_install" unless node['cfncluster']['cfn_region'].start_with?("cn-") + unless node['cfncluster']['cfn_region'].start_with?("cn-") + include_recipe "aws-parallelcluster::_efa_install" + else + package 'openmpi-devel' do + retries 3 + retry_delay 5 + end + end end From 3fc617027ffdac21b3382f18351fbebfcfd1b050 Mon Sep 17 00:00:00 2001 From: Luca Carrogu Date: Thu, 30 May 2019 11:05:10 +0200 Subject: [PATCH 35/50] Do not reinstall EFA package at runtime Signed-off-by: Luca Carrogu --- recipes/_efa_install.rb | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/recipes/_efa_install.rb b/recipes/_efa_install.rb index 93a0c84b19..3139240a34 100644 --- a/recipes/_efa_install.rb +++ b/recipes/_efa_install.rb @@ -26,12 +26,13 @@ bash "install efa" do cwd Chef::Config[:file_cache_path] - code <<-NODE + code <<-EFAINSTALL # default openmpi installation conflicts with new install # new one is installed in /opt/amazon/efa/bin/ yum remove -y openmpi openmpi-devel tar -xzf #{efa_tarball} cd aws-efa-installer ./efa_installer.sh -y - NODE + EFAINSTALL + creates '/opt/amazon/efa/bin/mpirun' end From 3fd5819d8245218cac261b5cab6f69a7f8739819 Mon Sep 17 00:00:00 2001 From: Francesco De Martino Date: Fri, 31 May 2019 12:23:10 +0200 Subject: [PATCH 36/50] sge: set ENABLE_RESCHEDULE_KILL=1 Signed-off-by: Francesco De Martino --- recipes/_master_sge_config.rb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/recipes/_master_sge_config.rb b/recipes/_master_sge_config.rb index 301c69296a..cbe1be4886 100644 --- a/recipes/_master_sge_config.rb +++ b/recipes/_master_sge_config.rb @@ -84,6 +84,6 @@ /opt/sge/util/qconf_mod_attr -mconf max_unheard 00:03:00 global /opt/sge/util/qconf_mod_attr -mconf reschedule_unknown 00:00:30 global 
/opt/sge/util/qconf_add_list_value -mconf qmaster_params ENABLE_FORCED_QDEL_IF_UNKNOWN global - /opt/sge/util/qconf_add_list_value -mconf qmaster_params ENABLE_RESCHEDULE_KILL global + /opt/sge/util/qconf_add_list_value -mconf qmaster_params ENABLE_RESCHEDULE_KILL=1 global CONFIGUNKNOWN end From 14b02a2d50f4fd60747383d4deb52fc5ecf0896b Mon Sep 17 00:00:00 2001 From: Sean Smith Date: Thu, 30 May 2019 09:28:27 -0700 Subject: [PATCH 37/50] Install EFA at Runtime * fetch installer during ami build (or at runtime with custom ami) * install only on Compute Nodes Signed-off-by: Sean Smith --- attributes/default.rb | 4 ++-- recipes/_efa_fetch.rb | 25 +++++++++++++++++++++++++ recipes/_efa_install.rb | 9 --------- recipes/base_config.rb | 5 +++++ recipes/base_install.rb | 13 +++---------- 5 files changed, 35 insertions(+), 21 deletions(-) create mode 100644 recipes/_efa_fetch.rb diff --git a/attributes/default.rb b/attributes/default.rb index 7f8b79f89f..a434e79d75 100644 --- a/attributes/default.rb +++ b/attributes/default.rb @@ -85,7 +85,7 @@ if node['platform_version'].to_i >= 7 default['cfncluster']['base_packages'] = %w[vim ksh tcsh zsh openssl-devel ncurses-devel pam-devel net-tools openmotif-devel libXmu-devel hwloc-devel libdb-devel tcl-devel automake autoconf pyparted libtool - httpd boost-devel redhat-lsb mlocate lvm2 mpich-devel R atlas-devel + httpd boost-devel redhat-lsb mlocate lvm2 mpich-devel openmpi-devel R atlas-devel blas-devel fftw-devel libffi-devel openssl-devel dkms mariadb-devel libedit-devel libical-devel postgresql-devel postgresql-server sendmail libxml2-devel libglvnd-devel mdadm] if node['platform_version'].split('.')[1] == '6' @@ -107,7 +107,7 @@ when 'amazon' default['cfncluster']['base_packages'] = %w[vim ksh tcsh zsh openssl-devel ncurses-devel pam-devel net-tools openmotif-devel libXmu-devel hwloc-devel db4-devel tcl-devel automake autoconf pyparted libtool - httpd boost-devel redhat-lsb mlocate mpich-devel R atlas-devel fftw-devel + httpd 
boost-devel redhat-lsb mlocate mpich-devel openmpi-devel R atlas-devel fftw-devel libffi-devel openssl-devel dkms mysql-devel libedit-devel postgresql-devel postgresql-server sendmail cmake byacc libglvnd-devel mdadm] end diff --git a/recipes/_efa_fetch.rb b/recipes/_efa_fetch.rb new file mode 100644 index 0000000000..7ed716cd91 --- /dev/null +++ b/recipes/_efa_fetch.rb @@ -0,0 +1,25 @@ +# +# Cookbook Name:: aws-parallelcluster +# Recipe:: _efa_fetch +# +# Copyright 2013-2019 Amazon.com, Inc. or its affiliates. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"). You may not use this file except in compliance with the +# License. A copy of the License is located at +# +# http://aws.amazon.com/apache2.0/ +# +# or in the "LICENSE.txt" file accompanying this file. This file is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES +# OR CONDITIONS OF ANY KIND, express or implied. See the License for the specific language governing permissions and +# limitations under the License. 
+ +efa_tarball = "#{node['cfncluster']['sources_dir']}/aws-efa-installer-latest.tar.gz" + +# Get EFA Installer +remote_file efa_tarball do + source node['cfncluster']['efa']['installer_url'] + mode '0644' + retries 3 + retry_delay 5 + not_if { ::File.exist?(efa_tarball) } +end diff --git a/recipes/_efa_install.rb b/recipes/_efa_install.rb index 3139240a34..c80e62a949 100644 --- a/recipes/_efa_install.rb +++ b/recipes/_efa_install.rb @@ -15,15 +15,6 @@ efa_tarball = "#{node['cfncluster']['sources_dir']}/aws-efa-installer-latest.tar.gz" -# Get EFA Installer -remote_file efa_tarball do - source node['cfncluster']['efa']['installer_url'] - mode '0644' - retries 3 - retry_delay 5 - not_if { ::File.exist?(efa_tarball) } -end - bash "install efa" do cwd Chef::Config[:file_cache_path] code <<-EFAINSTALL diff --git a/recipes/base_config.rb b/recipes/base_config.rb index 8f35fe95d5..7bad9c8728 100644 --- a/recipes/base_config.rb +++ b/recipes/base_config.rb @@ -58,3 +58,8 @@ # Mount FSx include_recipe 'aws-parallelcluster::fsx_mount' end + +# Install EFA +if node['cfncluster']['enable_efa'] == 'compute' && node['cfncluster']['cfn_node_type'] == 'ComputeFleet' + include_recipe "aws-parallelcluster::_efa_install" +end diff --git a/recipes/base_install.rb b/recipes/base_install.rb index e3c3b4b2f3..ac9583acaa 100644 --- a/recipes/base_install.rb +++ b/recipes/base_install.rb @@ -236,14 +236,7 @@ # Install FSx options include_recipe "aws-parallelcluster::_lustre_install" -# Install EFA -if (node['platform'] == 'centos' && node['platform_version'].to_i >= 7) || node['platform'] == 'amazon' - unless node['cfncluster']['cfn_region'].start_with?("cn-") - include_recipe "aws-parallelcluster::_efa_install" - else - package 'openmpi-devel' do - retries 3 - retry_delay 5 - end - end +# Fetch EFA Driver +unless node['cfncluster']['cfn_region'].start_with?("cn-") + include_recipe "aws-parallelcluster::_efa_fetch" end From 8ffcf39e2ae241ba5c37b50bdffe73976af88d0d Mon Sep 17 00:00:00 2001 
From: Sean Smith Date: Fri, 31 May 2019 09:15:59 -0700 Subject: [PATCH 38/50] Enable EFA Kitchen Test Signed-off-by: Sean Smith --- .kitchen.yml | 7 +++++++ recipes/tests.rb | 3 ++- 2 files changed, 9 insertions(+), 1 deletion(-) diff --git a/.kitchen.yml b/.kitchen.yml index 7fe74645f1..ec0a489704 100644 --- a/.kitchen.yml +++ b/.kitchen.yml @@ -70,6 +70,7 @@ suites: cfn_cluster_user: <%= ENV['CFN_CLUSTER_USER'] %> custom_node_package: <%= ENV['PARALLELCLUSTER_NODE_URL'] %> os: <%= ENV['OS'] %> + enable_efa: <%= ENV['ENABLE_EFA'] || "NONE" %> - name: sge_config_MasterServer run_list: @@ -92,6 +93,7 @@ suites: cfn_ddb_table: <%= ENV['CFN_DDB_TABLE'] %> custom_node_package: <%= ENV['PARALLELCLUSTER_NODE_URL'] %> os: <%= ENV['OS'] %> + enable_efa: <%= ENV['ENABLE_EFA'] || "NONE" %> - name: torque_config_MasterServer run_list: @@ -114,6 +116,7 @@ suites: cfn_ddb_table: <%= ENV['CFN_DDB_TABLE'] %> custom_node_package: <%= ENV['PARALLELCLUSTER_NODE_URL'] %> os: <%= ENV['OS'] %> + enable_efa: <%= ENV['ENABLE_EFA'] || "NONE" %> - name: slurm_config_MasterServer run_list: @@ -136,6 +139,7 @@ suites: cfn_ddb_table: <%= ENV['CFN_DDB_TABLE'] %> custom_node_package: <%= ENV['PARALLELCLUSTER_NODE_URL'] %> os: <%= ENV['OS'] %> + enable_efa: <%= ENV['ENABLE_EFA'] || "NONE" %> - name: sge_config_ComputeFleet run_list: @@ -158,6 +162,7 @@ suites: cfn_master: <%= ENV['CFN_MASTER'] %> custom_node_package: <%= ENV['PARALLELCLUSTER_NODE_URL'] %> os: <%= ENV['OS'] %> + enable_efa: <%= ENV['ENABLE_EFA'] || "NONE" %> - name: torque_config_ComputeFleet run_list: @@ -180,6 +185,7 @@ suites: cfn_master: <%= ENV['CFN_MASTER'] %> custom_node_package: <%= ENV['PARALLELCLUSTER_NODE_URL'] %> os: <%= ENV['OS'] %> + enable_efa: <%= ENV['ENABLE_EFA'] || "NONE" %> - name: slurm_config_ComputeFleet run_list: @@ -202,3 +208,4 @@ suites: cfn_master: <%= ENV['CFN_MASTER'] %> custom_node_package: <%= ENV['PARALLELCLUSTER_NODE_URL'] %> os: <%= ENV['OS'] %> + enable_efa: <%= ENV['ENABLE_EFA'] || "NONE" %> 
diff --git a/recipes/tests.rb b/recipes/tests.rb index 6d8c9b8feb..d620c96fc6 100644 --- a/recipes/tests.rb +++ b/recipes/tests.rb @@ -99,7 +99,8 @@ end end -if node['cfncluster']['os'] == 'alinux' || node['cfncluster']['os'] == 'centos7' +# Test EFA is installed +if node['cfncluster']['enable_efa'] == 'compute' execute 'check efa rpm installed' do command "rpm -qa | grep libfabric && rpm -qa | grep efa-" user node['cfncluster']['cfn_cluster_user'] From 2ec3d129bbdec24cbb25e7a8a364da77fc69b137 Mon Sep 17 00:00:00 2001 From: Francesco De Martino Date: Mon, 3 Jun 2019 14:16:27 +0200 Subject: [PATCH 39/50] Apply patch to fix issue in sge_edit_mod_attr script Signed-off-by: Francesco De Martino --- recipes/_master_sge_config.rb | 1 + recipes/sge_install.rb | 5 +++++ 2 files changed, 6 insertions(+) diff --git a/recipes/_master_sge_config.rb b/recipes/_master_sge_config.rb index cbe1be4886..f45d287b38 100644 --- a/recipes/_master_sge_config.rb +++ b/recipes/_master_sge_config.rb @@ -80,6 +80,7 @@ # ENABLE_RESCHEDULE_KILL: reschedule_unknown parameter affects also jobs which have the rerun flag not activated bash "configure_unknown_hosts_behaviour" do code <<-CONFIGUNKNOWN + set -e . 
/opt/sge/default/common/settings.sh /opt/sge/util/qconf_mod_attr -mconf max_unheard 00:03:00 global /opt/sge/util/qconf_mod_attr -mconf reschedule_unknown 00:00:30 global diff --git a/recipes/sge_install.rb b/recipes/sge_install.rb index 2a9cc336d4..86973a7065 100644 --- a/recipes/sge_install.rb +++ b/recipes/sge_install.rb @@ -62,8 +62,13 @@ bash "extract_qconf_util" do code <<-EXTRACTQCONFUTIL + set -e tar xf /opt/sge/util/qconf_scripts.tar.gz -C /opt/sge/util --strip-components=1 --no-same-permissions --no-same-owner + # applying small patch for a bug in sge_edit_mod_attr script + # [[]] is incompatible with dash which is the default sh in ubuntu + sed -i 's/if \\[\\[ $cc -eq 0 ]]/if [ $cc -eq 0 ]/g' /opt/sge/util/sge_edit_mod_attr EXTRACTQCONFUTIL + creates '/opt/sge/util/sge_edit_mod_attr' end # Disbale the AddQueue, so that we can manage slots per instance From bffb47ecba08fcb2d1518d20f1ca5b0867158e30 Mon Sep 17 00:00:00 2001 From: Sean Smith Date: Fri, 31 May 2019 16:21:57 -0700 Subject: [PATCH 40/50] Revert "Enable EFA Kitchen Test" This reverts commit 8ffcf39e2ae241ba5c37b50bdffe73976af88d0d. 
--- .kitchen.yml | 7 ------- recipes/tests.rb | 3 +-- 2 files changed, 1 insertion(+), 9 deletions(-) diff --git a/.kitchen.yml b/.kitchen.yml index ec0a489704..7fe74645f1 100644 --- a/.kitchen.yml +++ b/.kitchen.yml @@ -70,7 +70,6 @@ suites: cfn_cluster_user: <%= ENV['CFN_CLUSTER_USER'] %> custom_node_package: <%= ENV['PARALLELCLUSTER_NODE_URL'] %> os: <%= ENV['OS'] %> - enable_efa: <%= ENV['ENABLE_EFA'] || "NONE" %> - name: sge_config_MasterServer run_list: @@ -93,7 +92,6 @@ suites: cfn_ddb_table: <%= ENV['CFN_DDB_TABLE'] %> custom_node_package: <%= ENV['PARALLELCLUSTER_NODE_URL'] %> os: <%= ENV['OS'] %> - enable_efa: <%= ENV['ENABLE_EFA'] || "NONE" %> - name: torque_config_MasterServer run_list: @@ -116,7 +114,6 @@ suites: cfn_ddb_table: <%= ENV['CFN_DDB_TABLE'] %> custom_node_package: <%= ENV['PARALLELCLUSTER_NODE_URL'] %> os: <%= ENV['OS'] %> - enable_efa: <%= ENV['ENABLE_EFA'] || "NONE" %> - name: slurm_config_MasterServer run_list: @@ -139,7 +136,6 @@ suites: cfn_ddb_table: <%= ENV['CFN_DDB_TABLE'] %> custom_node_package: <%= ENV['PARALLELCLUSTER_NODE_URL'] %> os: <%= ENV['OS'] %> - enable_efa: <%= ENV['ENABLE_EFA'] || "NONE" %> - name: sge_config_ComputeFleet run_list: @@ -162,7 +158,6 @@ suites: cfn_master: <%= ENV['CFN_MASTER'] %> custom_node_package: <%= ENV['PARALLELCLUSTER_NODE_URL'] %> os: <%= ENV['OS'] %> - enable_efa: <%= ENV['ENABLE_EFA'] || "NONE" %> - name: torque_config_ComputeFleet run_list: @@ -185,7 +180,6 @@ suites: cfn_master: <%= ENV['CFN_MASTER'] %> custom_node_package: <%= ENV['PARALLELCLUSTER_NODE_URL'] %> os: <%= ENV['OS'] %> - enable_efa: <%= ENV['ENABLE_EFA'] || "NONE" %> - name: slurm_config_ComputeFleet run_list: @@ -208,4 +202,3 @@ suites: cfn_master: <%= ENV['CFN_MASTER'] %> custom_node_package: <%= ENV['PARALLELCLUSTER_NODE_URL'] %> os: <%= ENV['OS'] %> - enable_efa: <%= ENV['ENABLE_EFA'] || "NONE" %> diff --git a/recipes/tests.rb b/recipes/tests.rb index d620c96fc6..6d8c9b8feb 100644 --- a/recipes/tests.rb +++ 
b/recipes/tests.rb @@ -99,8 +99,7 @@ end end -# Test EFA is installed -if node['cfncluster']['enable_efa'] == 'compute' +if node['cfncluster']['os'] == 'alinux' || node['cfncluster']['os'] == 'centos7' execute 'check efa rpm installed' do command "rpm -qa | grep libfabric && rpm -qa | grep efa-" user node['cfncluster']['cfn_cluster_user'] From 00d9674d6e476ef1a085ca41c5be8f8fbf061e70 Mon Sep 17 00:00:00 2001 From: Sean Smith Date: Fri, 31 May 2019 16:27:49 -0700 Subject: [PATCH 41/50] Revert "Install EFA at Runtime" This reverts commit 14b02a2d50f4fd60747383d4deb52fc5ecf0896b. --- attributes/default.rb | 4 ++-- recipes/_efa_fetch.rb | 25 ------------------------- recipes/_efa_install.rb | 9 +++++++++ recipes/base_config.rb | 5 ----- recipes/base_install.rb | 13 ++++++++++--- 5 files changed, 21 insertions(+), 35 deletions(-) delete mode 100644 recipes/_efa_fetch.rb diff --git a/attributes/default.rb b/attributes/default.rb index a434e79d75..7f8b79f89f 100644 --- a/attributes/default.rb +++ b/attributes/default.rb @@ -85,7 +85,7 @@ if node['platform_version'].to_i >= 7 default['cfncluster']['base_packages'] = %w[vim ksh tcsh zsh openssl-devel ncurses-devel pam-devel net-tools openmotif-devel libXmu-devel hwloc-devel libdb-devel tcl-devel automake autoconf pyparted libtool - httpd boost-devel redhat-lsb mlocate lvm2 mpich-devel openmpi-devel R atlas-devel + httpd boost-devel redhat-lsb mlocate lvm2 mpich-devel R atlas-devel blas-devel fftw-devel libffi-devel openssl-devel dkms mariadb-devel libedit-devel libical-devel postgresql-devel postgresql-server sendmail libxml2-devel libglvnd-devel mdadm] if node['platform_version'].split('.')[1] == '6' @@ -107,7 +107,7 @@ when 'amazon' default['cfncluster']['base_packages'] = %w[vim ksh tcsh zsh openssl-devel ncurses-devel pam-devel net-tools openmotif-devel libXmu-devel hwloc-devel db4-devel tcl-devel automake autoconf pyparted libtool - httpd boost-devel redhat-lsb mlocate mpich-devel openmpi-devel R atlas-devel fftw-devel 
+ httpd boost-devel redhat-lsb mlocate mpich-devel R atlas-devel fftw-devel libffi-devel openssl-devel dkms mysql-devel libedit-devel postgresql-devel postgresql-server sendmail cmake byacc libglvnd-devel mdadm] end diff --git a/recipes/_efa_fetch.rb b/recipes/_efa_fetch.rb deleted file mode 100644 index 7ed716cd91..0000000000 --- a/recipes/_efa_fetch.rb +++ /dev/null @@ -1,25 +0,0 @@ -# -# Cookbook Name:: aws-parallelcluster -# Recipe:: _efa_fetch -# -# Copyright 2013-2019 Amazon.com, Inc. or its affiliates. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"). You may not use this file except in compliance with the -# License. A copy of the License is located at -# -# http://aws.amazon.com/apache2.0/ -# -# or in the "LICENSE.txt" file accompanying this file. This file is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES -# OR CONDITIONS OF ANY KIND, express or implied. See the License for the specific language governing permissions and -# limitations under the License. 
- -efa_tarball = "#{node['cfncluster']['sources_dir']}/aws-efa-installer-latest.tar.gz" - -# Get EFA Installer -remote_file efa_tarball do - source node['cfncluster']['efa']['installer_url'] - mode '0644' - retries 3 - retry_delay 5 - not_if { ::File.exist?(efa_tarball) } -end diff --git a/recipes/_efa_install.rb b/recipes/_efa_install.rb index c80e62a949..3139240a34 100644 --- a/recipes/_efa_install.rb +++ b/recipes/_efa_install.rb @@ -15,6 +15,15 @@ efa_tarball = "#{node['cfncluster']['sources_dir']}/aws-efa-installer-latest.tar.gz" +# Get EFA Installer +remote_file efa_tarball do + source node['cfncluster']['efa']['installer_url'] + mode '0644' + retries 3 + retry_delay 5 + not_if { ::File.exist?(efa_tarball) } +end + bash "install efa" do cwd Chef::Config[:file_cache_path] code <<-EFAINSTALL diff --git a/recipes/base_config.rb b/recipes/base_config.rb index 7bad9c8728..8f35fe95d5 100644 --- a/recipes/base_config.rb +++ b/recipes/base_config.rb @@ -58,8 +58,3 @@ # Mount FSx include_recipe 'aws-parallelcluster::fsx_mount' end - -# Install EFA -if node['cfncluster']['enable_efa'] == 'compute' && node['cfncluster']['cfn_node_type'] == 'ComputeFleet' - include_recipe "aws-parallelcluster::_efa_install" -end diff --git a/recipes/base_install.rb b/recipes/base_install.rb index ac9583acaa..e3c3b4b2f3 100644 --- a/recipes/base_install.rb +++ b/recipes/base_install.rb @@ -236,7 +236,14 @@ # Install FSx options include_recipe "aws-parallelcluster::_lustre_install" -# Fetch EFA Driver -unless node['cfncluster']['cfn_region'].start_with?("cn-") - include_recipe "aws-parallelcluster::_efa_fetch" +# Install EFA +if (node['platform'] == 'centos' && node['platform_version'].to_i >= 7) || node['platform'] == 'amazon' + unless node['cfncluster']['cfn_region'].start_with?("cn-") + include_recipe "aws-parallelcluster::_efa_install" + else + package 'openmpi-devel' do + retries 3 + retry_delay 5 + end + end end From d1639f56fc0d2b037b26281799f13ea5303c5546 Mon Sep 17 00:00:00 2001 
From: Sean Smith Date: Fri, 31 May 2019 16:33:03 -0700 Subject: [PATCH 42/50] Enable EFA Limit Check at Runtime This builds the rpms into the ami, then set the limits at runtime. Signed-off-by: Sean Smith --- recipes/_efa_enable.rb | 22 ++++++++++++++++++++++ recipes/_efa_install.rb | 4 ++-- recipes/base_config.rb | 5 +++++ 3 files changed, 29 insertions(+), 2 deletions(-) create mode 100644 recipes/_efa_enable.rb diff --git a/recipes/_efa_enable.rb b/recipes/_efa_enable.rb new file mode 100644 index 0000000000..8eea1e8da7 --- /dev/null +++ b/recipes/_efa_enable.rb @@ -0,0 +1,22 @@ +# +# Cookbook Name:: aws-parallelcluster +# Recipe:: _efa_enable +# +# Copyright 2013-2019 Amazon.com, Inc. or its affiliates. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"). You may not use this file except in compliance with the +# License. A copy of the License is located at +# +# http://aws.amazon.com/apache2.0/ +# +# or in the "LICENSE.txt" file accompanying this file. This file is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES +# OR CONDITIONS OF ANY KIND, express or implied. See the License for the specific language governing permissions and +# limitations under the License. 
+ +bash "enable efa limits" do + cwd node['cfncluster']['sources_dir'] + code <<-EFAENABLE + cd aws-efa-installer/install + ./efa_limits_setup.sh + EFAENABLE +end diff --git a/recipes/_efa_install.rb b/recipes/_efa_install.rb index 3139240a34..9e7db04bd8 100644 --- a/recipes/_efa_install.rb +++ b/recipes/_efa_install.rb @@ -25,14 +25,14 @@ end bash "install efa" do - cwd Chef::Config[:file_cache_path] + cwd node['cfncluster']['sources_dir'] code <<-EFAINSTALL # default openmpi installation conflicts with new install # new one is installed in /opt/amazon/efa/bin/ yum remove -y openmpi openmpi-devel tar -xzf #{efa_tarball} cd aws-efa-installer - ./efa_installer.sh -y + ./efa_installer.sh -y --skip-limit-conf EFAINSTALL creates '/opt/amazon/efa/bin/mpirun' end diff --git a/recipes/base_config.rb b/recipes/base_config.rb index 8f35fe95d5..e0f171d161 100644 --- a/recipes/base_config.rb +++ b/recipes/base_config.rb @@ -58,3 +58,8 @@ # Mount FSx include_recipe 'aws-parallelcluster::fsx_mount' end + +# Enable EFA +if node['cfncluster']['enable_efa'] == 'compute' && node['cfncluster']['cfn_node_type'] == 'ComputeFleet' + include_recipe "aws-parallelcluster::_efa_enable" +end \ No newline at end of file From 344e14a707b54850234127afa158ae22281139e6 Mon Sep 17 00:00:00 2001 From: Alex Ford Date: Tue, 4 Jun 2019 23:32:00 -0700 Subject: [PATCH 43/50] Unconditionally attempt fsx mount. Update base_config recipe to perform an unconditional attempt of fsx filesystem mount, rather than restricting to alinux/centos. Supports cases with custom ubuntu amis with fsx extensions installed. This is a no-op change in the default parallelcluster configuration, as the client also verifies os compatibility during configuration validation. Tidy common call of efs mount from master/compute recipes into base_config along fsx mount. 
--- recipes/_compute_base_config.rb | 3 --- recipes/_master_base_config.rb | 3 --- recipes/base_config.rb | 12 ++++++------ 3 files changed, 6 insertions(+), 12 deletions(-) diff --git a/recipes/_compute_base_config.rb b/recipes/_compute_base_config.rb index cef0d9266c..e6049b8188 100644 --- a/recipes/_compute_base_config.rb +++ b/recipes/_compute_base_config.rb @@ -15,9 +15,6 @@ nfs_master = node['cfncluster']['cfn_master'] -# Mount EFS directory with efs_mount recipe -include_recipe 'aws-parallelcluster::efs_mount' - # Parse and get RAID shared directory info and turn into an array raid_shared_dir = node['cfncluster']['cfn_raid_parameters'].split(',')[0] diff --git a/recipes/_master_base_config.rb b/recipes/_master_base_config.rb index 2e0e9f596b..073ad3da5f 100644 --- a/recipes/_master_base_config.rb +++ b/recipes/_master_base_config.rb @@ -28,9 +28,6 @@ # Get VPC CIDR node.default['cfncluster']['ec2-metadata']['vpc-ipv4-cidr-block'] = get_vpc_ipv4_cidr_block(node['macaddress']) -# Mount EFS directory with efs_mount recipe -include_recipe 'aws-parallelcluster::efs_mount' - # Parse shared directory info and turn into an array shared_dir_array = node['cfncluster']['cfn_shared_dir'].split(',') shared_dir_array.each_with_index do |dir, index| diff --git a/recipes/base_config.rb b/recipes/base_config.rb index e0f171d161..3f921c83ed 100644 --- a/recipes/base_config.rb +++ b/recipes/base_config.rb @@ -53,13 +53,13 @@ mode '0644' end -# Run FSx on centos and alinux -if node['platform'] == 'centos' || node['platform'] == 'amazon' - # Mount FSx - include_recipe 'aws-parallelcluster::fsx_mount' -end +# Mount EFS directory with efs_mount recipe +include_recipe 'aws-parallelcluster::efs_mount' + +# Mount FSx directory with fsx_mount recipe +include_recipe 'aws-parallelcluster::fsx_mount' # Enable EFA if node['cfncluster']['enable_efa'] == 'compute' && node['cfncluster']['cfn_node_type'] == 'ComputeFleet' include_recipe "aws-parallelcluster::_efa_enable" -end \ No newline at 
end of file +end From 25ee19c4ac0ed87287978f379f6e9dc1ceeb8939 Mon Sep 17 00:00:00 2001 From: Sean Smith Date: Wed, 5 Jun 2019 11:37:15 -0700 Subject: [PATCH 44/50] Enable EFA on Ubuntu1604 Signed-off-by: Sean Smith --- attributes/default.rb | 5 ++++- recipes/_efa_install.rb | 16 +++++++++++++--- recipes/base_install.rb | 10 ++++++---- 3 files changed, 23 insertions(+), 8 deletions(-) diff --git a/attributes/default.rb b/attributes/default.rb index 7f8b79f89f..d9407ed6c7 100644 --- a/attributes/default.rb +++ b/attributes/default.rb @@ -125,8 +125,11 @@ default['cfncluster']['base_packages'] = %w[vim ksh tcsh zsh libssl-dev ncurses-dev libpam-dev net-tools libhwloc-dev dkms tcl-dev automake autoconf python-parted libtool librrd-dev libapr1-dev libconfuse-dev apache2 libboost-dev libdb-dev tcsh libssl-dev libncurses5-dev libpam0g-dev libxt-dev - libmotif-dev libxmu-dev libxft-dev libhwloc-dev man-db lvm2 libmpich-dev libopenmpi-dev + libmotif-dev libxmu-dev libxft-dev libhwloc-dev man-db lvm2 libmpich-dev r-base libatlas-dev libblas-dev libfftw3-dev libffi-dev libssl-dev libxml2-dev mdadm] + if node['platform_version'] == '14.04' + default['cfncluster']['base_packages'].push('libopenmpi-dev') + end default['cfncluster']['kernel_generic_pkg'] = "linux-generic" default['cfncluster']['kernel_extra_pkg'] = "linux-image-extra-#{node['kernel']['release']}" default['cfncluster']['ganglia']['apache_user'] = 'www-data' diff --git a/recipes/_efa_install.rb b/recipes/_efa_install.rb index 9e7db04bd8..4aa6498eef 100644 --- a/recipes/_efa_install.rb +++ b/recipes/_efa_install.rb @@ -24,12 +24,22 @@ not_if { ::File.exist?(efa_tarball) } end +# default openmpi installation conflicts with new install +# new one is installed in /opt/amazon/efa/bin/ +case node['platform_family'] +when 'rhel', 'amazon' + package %w[openmpi-devel openmpi] do + action :remove + end +when 'debian' + package "libopenmpi-dev" do + action :remove + end +end + bash "install efa" do cwd 
node['cfncluster']['sources_dir'] code <<-EFAINSTALL - # default openmpi installation conflicts with new install - # new one is installed in /opt/amazon/efa/bin/ - yum remove -y openmpi openmpi-devel tar -xzf #{efa_tarball} cd aws-efa-installer ./efa_installer.sh -y --skip-limit-conf diff --git a/recipes/base_install.rb b/recipes/base_install.rb index e3c3b4b2f3..9d37067544 100644 --- a/recipes/base_install.rb +++ b/recipes/base_install.rb @@ -237,13 +237,15 @@ include_recipe "aws-parallelcluster::_lustre_install" # Install EFA -if (node['platform'] == 'centos' && node['platform_version'].to_i >= 7) || node['platform'] == 'amazon' +if (node['platform'] == 'centos' && node['platform_version'].to_i >= 7) || node['platform'] == 'amazon' || (node['platform'] == 'ubuntu' && node['platform_version'] == "16.04") unless node['cfncluster']['cfn_region'].start_with?("cn-") include_recipe "aws-parallelcluster::_efa_install" else - package 'openmpi-devel' do - retries 3 - retry_delay 5 + case node['platform_family'] + when 'rhel', 'amazon' + package %w[openmpi-devel openmpi] + when 'debian' + package "libopenmpi-dev" end end end From 28c92eee62c875e4ffeca64e5e3dac3dbe66bd8f Mon Sep 17 00:00:00 2001 From: Sean Smith Date: Wed, 5 Jun 2019 19:59:22 -0700 Subject: [PATCH 45/50] Add Ubuntu to Kitchen Tests Signed-off-by: Sean Smith --- recipes/tests.rb | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/recipes/tests.rb b/recipes/tests.rb index 6d8c9b8feb..52acf84eca 100644 --- a/recipes/tests.rb +++ b/recipes/tests.rb @@ -99,11 +99,17 @@ end end -if node['cfncluster']['os'] == 'alinux' || node['cfncluster']['os'] == 'centos7' +case node['cfncluster']['os'] +when 'alinux', 'centos7' execute 'check efa rpm installed' do command "rpm -qa | grep libfabric && rpm -qa | grep efa-" user node['cfncluster']['cfn_cluster_user'] end +when 'ubuntu1604' + execute 'check efa rpm installed' do + command "dpkg -l | grep libfabric && dpkg -l | grep 'efa '" + user 
node['cfncluster']['cfn_cluster_user'] + end end unless node['cfncluster']['os'].end_with?("-custom") From f92426d8dab7156e11bebcdca5712499eb37e3e5 Mon Sep 17 00:00:00 2001 From: Luca Carrogu Date: Fri, 7 Jun 2019 08:15:12 +0200 Subject: [PATCH 46/50] Skip EFA test in China regions Signed-off-by: Luca Carrogu --- recipes/tests.rb | 22 ++++++++++++---------- 1 file changed, 12 insertions(+), 10 deletions(-) diff --git a/recipes/tests.rb b/recipes/tests.rb index 52acf84eca..07e8323a57 100644 --- a/recipes/tests.rb +++ b/recipes/tests.rb @@ -99,16 +99,18 @@ end end -case node['cfncluster']['os'] -when 'alinux', 'centos7' - execute 'check efa rpm installed' do - command "rpm -qa | grep libfabric && rpm -qa | grep efa-" - user node['cfncluster']['cfn_cluster_user'] - end -when 'ubuntu1604' - execute 'check efa rpm installed' do - command "dpkg -l | grep libfabric && dpkg -l | grep 'efa '" - user node['cfncluster']['cfn_cluster_user'] +unless node['cfncluster']['cfn_region'].start_with?("cn-") + case node['cfncluster']['os'] + when 'alinux', 'centos7' + execute 'check efa rpm installed' do + command "rpm -qa | grep libfabric && rpm -qa | grep efa-" + user node['cfncluster']['cfn_cluster_user'] + end + when 'ubuntu1604' + execute 'check efa rpm installed' do + command "dpkg -l | grep libfabric && dpkg -l | grep 'efa '" + user node['cfncluster']['cfn_cluster_user'] + end end end From 6afb48057396d3d1edc798d501231a70cc54d676 Mon Sep 17 00:00:00 2001 From: Luca Carrogu Date: Thu, 6 Jun 2019 17:49:37 +0200 Subject: [PATCH 47/50] Set ulimit nofile to be 10000 Signed-off-by: Luca Carrogu --- metadata.rb | 1 + recipes/_default_pre.rb | 4 ++++ 2 files changed, 5 insertions(+) diff --git a/metadata.rb b/metadata.rb index 8e3111b763..48d6c1233b 100644 --- a/metadata.rb +++ b/metadata.rb @@ -27,3 +27,4 @@ depends 'apt', '~> 7.0.0' depends 'hostname', '~> 0.4.2' depends 'line', '~> 1.0.6' +depends 'ulimit', '~> 1.0.0' \ No newline at end of file diff --git a/recipes/_default_pre.rb 
b/recipes/_default_pre.rb index 4734b915a9..860e454326 100644 --- a/recipes/_default_pre.rb +++ b/recipes/_default_pre.rb @@ -13,6 +13,10 @@ # OR CONDITIONS OF ANY KIND, express or implied. See the License for the specific language governing permissions and # limitations under the License. +user_ulimit "*" do + filehandle_limit 10000 +end + include_recipe 'aws-parallelcluster::_update_packages' # Reboot after preliminary configuration steps From e44d47eb81553582cd8501a773e978c205796d1c Mon Sep 17 00:00:00 2001 From: Luca Carrogu Date: Thu, 6 Jun 2019 12:23:48 +0200 Subject: [PATCH 48/50] Test soft ulimit nofile to be greater than or equal to 10000 Signed-off-by: Luca Carrogu --- recipes/tests.rb | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/recipes/tests.rb b/recipes/tests.rb index 07e8323a57..2fe2b5c0ec 100644 --- a/recipes/tests.rb +++ b/recipes/tests.rb @@ -31,6 +31,13 @@ AWSREGIONS end +unless node['cfncluster']['os'].end_with?("-custom") + bash 'test soft ulimit nofile' do + code "if (($(ulimit -Sn) < 10000)); then exit 1; fi" + user node['cfncluster']['cfn_cluster_user'] + end +end + if node['cfncluster']['cfn_scheduler'] == 'sge' case node['cfncluster']['cfn_node_type'] when 'MasterServer' From ea8a22fa03929db8b7be04d2ca3c1febc159c001 Mon Sep 17 00:00:00 2001 From: Francesco De Martino Date: Thu, 6 Jun 2019 11:53:40 +0200 Subject: [PATCH 49/50] Update changelog for v2.4.0 Signed-off-by: Francesco De Martino --- CHANGELOG.md | 25 ++++++++++++++++++++++++- 1 file changed, 24 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 576fcf92ec..f76ed00542 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,9 +7,32 @@ This file is used to list changes made in each version of the AWS ParallelCluste ----- **ENHANCEMENTS** - +- Add support for EFA on Centos 7, Amazon Linux and Ubuntu 1604 - Add support for Ubuntu in China region `cn-northwest-1` +**CHANGES** +- SGE: changed following parameters in global configuration + - `max_unheard 
00:03:00`: allows a faster reaction in case of faulty nodes + - `reschedule_unknown 00:00:30`: enables rescheduling of jobs running on failing nodes + - `qmaster_params ENABLE_FORCED_QDEL_IF_UNKNOWN`: forces job deletion on unresponsive nodes + - `qmaster_params ENABLE_RESCHEDULE_KILL`: forces rescheduling or killing of jobs running on failing nodes +- Slurm: decrease SlurmdTimeout to 120 seconds to speed up replacement of faulty nodes +- Always use full master FQDN when mounting NFS on compute nodes. This solves some issues occurring with some networking + setups and custom DNS configurations +- Set soft and hard ulimit on open files to 10000 for all supported OSs +- Pin python `supervisor` version to 3.4.0 +- Remove unused `compute_instance_type` from jobwatcher.cfg +- Removed unused `max_queue_size` from sqswatcher.cfg +- Remove double quoting of the post_install args + +**BUG FIXES** +- Fix issue that was preventing Torque from being used on Centos 7 +- Start node daemons at the end of instance initialization. The time spent for post-install script and node + initialization is not counted as part of node idletime anymore. 
+- Fix issue which was causing an additional and invalid EBS mount point to be added in case of multiple EBS +- Install Slurm libpmpi/libpmpi2 that is distributed in a separate package since Slurm 17 + + 2.3.1 ----- From 4afb7d672118aa192dd546b18ee4fcc4452e978d Mon Sep 17 00:00:00 2001 From: Sean Smith Date: Mon, 10 Jun 2019 10:56:05 -0700 Subject: [PATCH 50/50] Set ulimits when EFA Enabled * Sets the max_memory ulimit on the master when EFA is enabled Signed-off-by: Sean Smith --- recipes/base_config.rb | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/recipes/base_config.rb b/recipes/base_config.rb index 3f921c83ed..eeec359284 100644 --- a/recipes/base_config.rb +++ b/recipes/base_config.rb @@ -62,4 +62,8 @@ # Enable EFA if node['cfncluster']['enable_efa'] == 'compute' && node['cfncluster']['cfn_node_type'] == 'ComputeFleet' include_recipe "aws-parallelcluster::_efa_enable" +elsif node['cfncluster']['enable_efa'] == 'compute' + user_ulimit "*" do + memory_limit 'unlimited' + end end