diff --git a/.kitchen.cloud.yml b/.kitchen.cloud.yml index e99174d817..443e688abd 100644 --- a/.kitchen.cloud.yml +++ b/.kitchen.cloud.yml @@ -2,6 +2,7 @@ driver_config: retryable_sleep: 15 retryable_tries: 20 + retry_limit: 6 aws_ssh_key_id: <%= ENV['AWS_KEYPAIR_NAME'] %> region: <%= ENV['AWS_DEFAULT_REGION'] %> instance_type: <%= ENV['AWS_FLAVOR_ID'] %> @@ -16,6 +17,8 @@ driver_config: provisioner: name: chef_zero require_chef_omnibus: 14.2.0 + # use custom chef install URL to cope with issue https://github.com/chef/bento/issues/609 + chef_omnibus_url: https://raw.githubusercontent.com/aws/aws-parallelcluster-cookbook/develop/util/chef-install.sh retry_on_exit_code: - 35 # 35 is the exit code signaling that the node is rebooting max_retries: 1 diff --git a/.kitchen.yml b/.kitchen.yml index 6324e48a8d..7fe74645f1 100644 --- a/.kitchen.yml +++ b/.kitchen.yml @@ -68,11 +68,14 @@ suites: cfn_ephemeral_dir: <%= ENV['CFN_EPHEMERAL_DIR'] %> cfn_shared_dir: <%= ENV['CFN_SHARED_DIR'] %> cfn_cluster_user: <%= ENV['CFN_CLUSTER_USER'] %> + custom_node_package: <%= ENV['PARALLELCLUSTER_NODE_URL'] %> + os: <%= ENV['OS'] %> - name: sge_config_MasterServer run_list: - recipe[aws-parallelcluster::_prep_env] - recipe[aws-parallelcluster::sge_config] + - recipe[aws-parallelcluster::finalize] - recipe[aws-parallelcluster::tests] attributes: cfncluster: @@ -87,11 +90,14 @@ suites: cfn_cluster_user: <%= ENV['CFN_CLUSTER_USER'] %> cfn_sqs_queue: <%= ENV['CFN_SQS_QUEUE'] %> cfn_ddb_table: <%= ENV['CFN_DDB_TABLE'] %> + custom_node_package: <%= ENV['PARALLELCLUSTER_NODE_URL'] %> + os: <%= ENV['OS'] %> - name: torque_config_MasterServer run_list: - recipe[aws-parallelcluster::_prep_env] - recipe[aws-parallelcluster::torque_config] + - recipe[aws-parallelcluster::finalize] - recipe[aws-parallelcluster::tests] attributes: cfncluster: @@ -106,11 +112,14 @@ suites: cfn_cluster_user: <%= ENV['CFN_CLUSTER_USER'] %> cfn_sqs_queue: <%= ENV['CFN_SQS_QUEUE'] %> cfn_ddb_table: <%= 
ENV['CFN_DDB_TABLE'] %> + custom_node_package: <%= ENV['PARALLELCLUSTER_NODE_URL'] %> + os: <%= ENV['OS'] %> - name: slurm_config_MasterServer run_list: - recipe[aws-parallelcluster::_prep_env] - recipe[aws-parallelcluster::slurm_config] + - recipe[aws-parallelcluster::finalize] - recipe[aws-parallelcluster::tests] attributes: cfncluster: @@ -125,11 +134,14 @@ suites: cfn_cluster_user: <%= ENV['CFN_CLUSTER_USER'] %> cfn_sqs_queue: <%= ENV['CFN_SQS_QUEUE'] %> cfn_ddb_table: <%= ENV['CFN_DDB_TABLE'] %> + custom_node_package: <%= ENV['PARALLELCLUSTER_NODE_URL'] %> + os: <%= ENV['OS'] %> - name: sge_config_ComputeFleet run_list: - recipe[aws-parallelcluster::_prep_env] - recipe[aws-parallelcluster::sge_config] + - recipe[aws-parallelcluster::finalize] - recipe[aws-parallelcluster::tests] attributes: cfncluster: @@ -144,11 +156,14 @@ suites: cfn_cluster_user: <%= ENV['CFN_CLUSTER_USER'] %> cfn_sqs_queue: <%= ENV['CFN_SQS_QUEUE'] %> cfn_master: <%= ENV['CFN_MASTER'] %> + custom_node_package: <%= ENV['PARALLELCLUSTER_NODE_URL'] %> + os: <%= ENV['OS'] %> - name: torque_config_ComputeFleet run_list: - recipe[aws-parallelcluster::_prep_env] - recipe[aws-parallelcluster::torque_config] + - recipe[aws-parallelcluster::finalize] - recipe[aws-parallelcluster::tests] attributes: cfncluster: @@ -163,11 +178,14 @@ suites: cfn_cluster_user: <%= ENV['CFN_CLUSTER_USER'] %> cfn_sqs_queue: <%= ENV['CFN_SQS_QUEUE'] %> cfn_master: <%= ENV['CFN_MASTER'] %> + custom_node_package: <%= ENV['PARALLELCLUSTER_NODE_URL'] %> + os: <%= ENV['OS'] %> - name: slurm_config_ComputeFleet run_list: - recipe[aws-parallelcluster::_prep_env] - recipe[aws-parallelcluster::slurm_config] + - recipe[aws-parallelcluster::finalize] - recipe[aws-parallelcluster::tests] attributes: cfncluster: @@ -182,3 +200,5 @@ suites: cfn_cluster_user: <%= ENV['CFN_CLUSTER_USER'] %> cfn_sqs_queue: <%= ENV['CFN_SQS_QUEUE'] %> cfn_master: <%= ENV['CFN_MASTER'] %> + custom_node_package: <%= ENV['PARALLELCLUSTER_NODE_URL'] %> + os: 
<%= ENV['OS'] %> diff --git a/CHANGELOG.md b/CHANGELOG.md index ede6861f6d..f76ed00542 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -3,6 +3,36 @@ aws-parallelcluster-cookbook CHANGELOG This file is used to list changes made in each version of the AWS ParallelCluster cookbook. +2.4.0 +----- + +**ENHANCEMENTS** +- Add support for EFA on Centos 7, Amazon Linux and Ubuntu 1604 +- Add support for Ubuntu in China region `cn-northwest-1` + +**CHANGES** +- SGE: changed following parameters in global configuration + - `max_unheard 00:03:00`: allows a faster reaction in case of faulty nodes + - `reschedule_unknown 00:00:30`: enables rescheduling of jobs running on failing nodes + - `qmaster_params ENABLE_FORCED_QDEL_IF_UNKNOWN`: forces job deletion on unresponsive nodes + - `qmaster_params ENABLE_RESCHEDULE_KILL`: forces rescheduling or killing of jobs running on failing nodes +- Slurm: decrease SlurmdTimeout to 120 seconds to speed up replacement of faulty nodes +- Always use full master FQDN when mounting NFS on compute nodes. This solves some issues occurring with some networking + setups and custom DNS configurations +- Set soft and hard ulimit on open files to 10000 for all supported OSs +- Pin python `supervisor` version to 3.4.0 +- Remove unused `compute_instance_type` from jobwatcher.cfg +- Remove unused `max_queue_size` from sqswatcher.cfg +- Remove double quoting of the post_install args + +**BUG FIXES** +- Fix issue that was preventing Torque from being used on Centos 7 +- Start node daemons at the end of instance initialization. The time spent for post-install script and node + initialization is not counted as part of node idletime anymore. 
+- Fix issue which was causing an additional and invalid EBS mount point to be added in case of multiple EBS +- Install Slurm libpmpi/libpmpi2 that is distributed in a separate package since Slurm 17 + + 2.3.1 ----- diff --git a/amis/packer_alinux.json b/amis/packer_alinux.json index 494f34b165..913e670ebd 100644 --- a/amis/packer_alinux.json +++ b/amis/packer_alinux.json @@ -210,6 +210,7 @@ "pause_before": "2m", "json" : { "cfncluster" : { + "cfn_region": "{{user `region`}}", "nvidia" : { "enabled" : "{{user `nvidia_enabled`}}" }, @@ -246,7 +247,6 @@ }, { "type" : "shell", - "only": ["custom-alinux"], "inline" : [ "sudo /usr/local/sbin/ami_cleanup.sh" ] diff --git a/amis/packer_centos6.json b/amis/packer_centos6.json index d2bc2df2fb..a6e23513c7 100644 --- a/amis/packer_centos6.json +++ b/amis/packer_centos6.json @@ -219,6 +219,7 @@ "pause_before": "2m", "json" : { "cfncluster" : { + "cfn_region": "{{user `region`}}", "nvidia" : { "enabled" : "{{user `nvidia_enabled`}}" }, @@ -251,7 +252,7 @@ "inline" : [ "region=\"{{user `region`}}\"", "bucket=\"s3.amazonaws.com\"", - "[[ ${region} =~ ^cn- ]] && bucket=\"s3.cn-north-1.amazonaws.com.cn\"", + "[[ ${region} =~ ^cn- ]] && bucket=\"s3.cn-north-1.amazonaws.com.cn/cn-north-1-aws-parallelcluster\"", "curl --retry 3 -L -o /tmp/aws-cfn-bootstrap-latest.tar.gz https://${bucket}/cloudformation-examples/aws-cfn-bootstrap-latest.tar.gz", "sudo pip install /tmp/aws-cfn-bootstrap-latest.tar.gz" ] @@ -265,7 +266,6 @@ }, { "type" : "shell", - "only": ["custom-centos6"], "inline" : [ "sudo /usr/local/sbin/ami_cleanup.sh" ] diff --git a/amis/packer_centos7.json b/amis/packer_centos7.json index fefa92c370..7bced7a62f 100644 --- a/amis/packer_centos7.json +++ b/amis/packer_centos7.json @@ -224,6 +224,7 @@ "pause_before": "2m", "json" : { "cfncluster" : { + "cfn_region": "{{user `region`}}", "nvidia" : { "enabled" : "{{user `nvidia_enabled`}}" }, @@ -256,7 +257,7 @@ "inline" : [ "region=\"{{user `region`}}\"", 
"bucket=\"s3.amazonaws.com\"", - "[[ ${region} =~ ^cn- ]] && bucket=\"s3.cn-north-1.amazonaws.com.cn\"", + "[[ ${region} =~ ^cn- ]] && bucket=\"s3.cn-north-1.amazonaws.com.cn/cn-north-1-aws-parallelcluster\"", "curl --retry 3 -L -o /tmp/aws-cfn-bootstrap-latest.tar.gz https://${bucket}/cloudformation-examples/aws-cfn-bootstrap-latest.tar.gz", "which pip2", "if [ $? -eq 0 ]; then sudo pip2 install /tmp/aws-cfn-bootstrap-latest.tar.gz; else sudo pip install /tmp/aws-cfn-bootstrap-latest.tar.gz; fi" @@ -277,7 +278,6 @@ }, { "type" : "shell", - "only": ["custom-centos7"], "inline" : [ "sudo /usr/local/sbin/ami_cleanup.sh" ] diff --git a/amis/packer_ubuntu1404.json b/amis/packer_ubuntu1404.json index 6edac7f68c..dea5dd40c4 100644 --- a/amis/packer_ubuntu1404.json +++ b/amis/packer_ubuntu1404.json @@ -224,6 +224,7 @@ "pause_before": "2m", "json" : { "cfncluster" : { + "cfn_region": "{{user `region`}}", "nvidia" : { "enabled" : "{{user `nvidia_enabled`}}" }, @@ -257,7 +258,7 @@ "inline" : [ "region=\"{{user `region`}}\"", "bucket=\"s3.amazonaws.com\"", - "[[ ${region} =~ ^cn- ]] && bucket=\"s3.cn-north-1.amazonaws.com.cn\"", + "[[ ${region} =~ ^cn- ]] && bucket=\"s3.cn-north-1.amazonaws.com.cn/cn-north-1-aws-parallelcluster\"", "curl --retry 3 -L -o /tmp/aws-cfn-bootstrap-latest.tar.gz https://${bucket}/cloudformation-examples/aws-cfn-bootstrap-latest.tar.gz", "sudo pip install /tmp/aws-cfn-bootstrap-latest.tar.gz" ] @@ -271,7 +272,6 @@ }, { "type" : "shell", - "only": ["custom-ubuntu1404"], "inline" : [ "sudo /usr/local/sbin/ami_cleanup.sh" ] diff --git a/amis/packer_ubuntu1604.json b/amis/packer_ubuntu1604.json index 350c6cfad3..9e3ce206b4 100644 --- a/amis/packer_ubuntu1604.json +++ b/amis/packer_ubuntu1604.json @@ -227,6 +227,7 @@ "pause_before": "2m", "json" : { "cfncluster" : { + "cfn_region": "{{user `region`}}", "nvidia" : { "enabled" : "{{user `nvidia_enabled`}}" }, @@ -260,7 +261,7 @@ "inline" : [ "region=\"{{user `region`}}\"", "bucket=\"s3.amazonaws.com\"", - 
"[[ ${region} =~ ^cn- ]] && bucket=\"s3.cn-north-1.amazonaws.com.cn\"", + "[[ ${region} =~ ^cn- ]] && bucket=\"s3.cn-north-1.amazonaws.com.cn/cn-north-1-aws-parallelcluster\"", "curl --retry 3 -L -o /tmp/aws-cfn-bootstrap-latest.tar.gz https://${bucket}/cloudformation-examples/aws-cfn-bootstrap-latest.tar.gz", "sudo pip install /tmp/aws-cfn-bootstrap-latest.tar.gz" ] @@ -274,7 +275,6 @@ }, { "type" : "shell", - "only": ["custom-ubuntu1604"], "inline" : [ "sudo /usr/local/sbin/ami_cleanup.sh" ] diff --git a/amis/packer_variables.json b/amis/packer_variables.json index 456e6aa2c7..ca032b508c 100644 --- a/amis/packer_variables.json +++ b/amis/packer_variables.json @@ -1,6 +1,6 @@ { - "parallelcluster_version": "2.3.1", - "parallelcluster_cookbook_version": "2.3.1", + "parallelcluster_version": "2.4.0", + "parallelcluster_cookbook_version": "2.4.0", "chef_version": "14.2.0", "ridley_version": "5.1.1", "berkshelf_version": "7.0.4" diff --git a/attributes/default.rb b/attributes/default.rb index 8dd6533a90..d9407ed6c7 100644 --- a/attributes/default.rb +++ b/attributes/default.rb @@ -19,9 +19,9 @@ default['cfncluster']['scripts_dir'] = "#{node['cfncluster']['base_dir']}/scripts" default['cfncluster']['license_dir'] = "#{node['cfncluster']['base_dir']}/licenses" # Python packages -default['cfncluster']['cfncluster-version'] = '2.3.1' -default['cfncluster']['cfncluster-node-version'] = '2.3.1' -default['cfncluster']['cfncluster-supervisor-version'] = '3.3.1' +default['cfncluster']['cfncluster-version'] = '2.4.0' +default['cfncluster']['cfncluster-node-version'] = '2.4.0' +default['cfncluster']['supervisor-version'] = '3.4.0' # URLs to software packages used during install recipes # Gridengine software default['cfncluster']['sge']['version'] = '8.1.9' @@ -45,6 +45,8 @@ default['cfncluster']['nvidia']['enabled'] = 'no' default['cfncluster']['nvidia']['driver_url'] = 'http://download.nvidia.com/XFree86/Linux-x86_64/418.56/NVIDIA-Linux-x86_64-418.56.run' 
default['cfncluster']['nvidia']['cuda_url'] = 'https://developer.nvidia.com/compute/cuda/10.0/Prod/local_installers/cuda_10.0.130_410.48_linux' +# EFA +default['cfncluster']['efa']['installer_url'] = 'https://s3-us-west-2.amazonaws.com/aws-efa-installer/aws-efa-installer-latest.tar.gz' # Reboot after default_pre recipe default['cfncluster']['default_pre_reboot'] = 'true' @@ -83,7 +85,7 @@ if node['platform_version'].to_i >= 7 default['cfncluster']['base_packages'] = %w[vim ksh tcsh zsh openssl-devel ncurses-devel pam-devel net-tools openmotif-devel libXmu-devel hwloc-devel libdb-devel tcl-devel automake autoconf pyparted libtool - httpd boost-devel redhat-lsb mlocate lvm2 mpich-devel openmpi-devel R atlas-devel + httpd boost-devel redhat-lsb mlocate lvm2 mpich-devel R atlas-devel blas-devel fftw-devel libffi-devel openssl-devel dkms mariadb-devel libedit-devel libical-devel postgresql-devel postgresql-server sendmail libxml2-devel libglvnd-devel mdadm] if node['platform_version'].split('.')[1] == '6' @@ -105,7 +107,7 @@ when 'amazon' default['cfncluster']['base_packages'] = %w[vim ksh tcsh zsh openssl-devel ncurses-devel pam-devel net-tools openmotif-devel libXmu-devel hwloc-devel db4-devel tcl-devel automake autoconf pyparted libtool - httpd boost-devel redhat-lsb mlocate mpich-devel openmpi-devel R atlas-devel fftw-devel + httpd boost-devel redhat-lsb mlocate mpich-devel R atlas-devel fftw-devel libffi-devel openssl-devel dkms mysql-devel libedit-devel postgresql-devel postgresql-server sendmail cmake byacc libglvnd-devel mdadm] end @@ -123,8 +125,11 @@ default['cfncluster']['base_packages'] = %w[vim ksh tcsh zsh libssl-dev ncurses-dev libpam-dev net-tools libhwloc-dev dkms tcl-dev automake autoconf python-parted libtool librrd-dev libapr1-dev libconfuse-dev apache2 libboost-dev libdb-dev tcsh libssl-dev libncurses5-dev libpam0g-dev libxt-dev - libmotif-dev libxmu-dev libxft-dev libhwloc-dev man-db lvm2 libmpich-dev libopenmpi-dev + libmotif-dev libxmu-dev 
libxft-dev libhwloc-dev man-db lvm2 libmpich-dev r-base libatlas-dev libblas-dev libfftw3-dev libffi-dev libssl-dev libxml2-dev mdadm] + if node['platform_version'] == '14.04' + default['cfncluster']['base_packages'].push('libopenmpi-dev') + end default['cfncluster']['kernel_generic_pkg'] = "linux-generic" default['cfncluster']['kernel_extra_pkg'] = "linux-image-extra-#{node['kernel']['release']}" default['cfncluster']['ganglia']['apache_user'] = 'www-data' @@ -166,7 +171,6 @@ default['cfncluster']['cfn_shared_dir'] = '/shared' default['cfncluster']['cfn_efs_shared_dir'] = 'NONE' default['cfncluster']['cfn_efs'] = nil -default['cfncluster']['cfn_node_type'] = nil default['cfncluster']['cfn_master'] = nil default['cfncluster']['cfn_cluster_user'] = 'ec2-user' default['cfncluster']['cfn_fsx_options'] = 'NONE' diff --git a/files/default/ami_cleanup.sh b/files/default/ami_cleanup.sh index 57515fa011..c6993be854 100644 --- a/files/default/ami_cleanup.sh +++ b/files/default/ami_cleanup.sh @@ -5,5 +5,13 @@ rm -rf /var/lib/cloud/instances/* rm -f /var/lib/cloud/instance rm -rf /etc/ssh/ssh_host_* rm -f /etc/udev/rules.d/70-persistent-net.rules +grep -l "Created by cloud-init on instance boot automatically" /etc/sysconfig/network-scripts/ifcfg-* | xargs rm -f + +# https://bugs.centos.org/view.php?id=13836#c33128 +source /etc/os-release +if [ "${ID}${VERSION_ID}" == "centos7" ]; then + rm -f /etc/sysconfig/network-scripts/ifcfg-eth0 +fi + find /var/log -type f -exec /bin/rm -v {} \; -touch /var/log/lastlog \ No newline at end of file +touch /var/log/lastlog diff --git a/files/default/compute_ready b/files/default/compute_ready index f557feb36a..3fa2dc53af 100644 --- a/files/default/compute_ready +++ b/files/default/compute_ready @@ -1,4 +1,5 @@ #!/bin/bash +set -e . 
/etc/parallelcluster/cfnconfig diff --git a/files/default/setup-ephemeral-drives.sh b/files/default/setup-ephemeral-drives.sh index b35201cd35..1b1c2c831d 100644 --- a/files/default/setup-ephemeral-drives.sh +++ b/files/default/setup-ephemeral-drives.sh @@ -21,6 +21,13 @@ function error_exit () { exit 1 } +function exec_command() { + _command_output=$($@ 2>&1) + _exit_code=$? + + # Do not set RC=1 if error says that changes have been written but a reboot is required to inform the kernel + [[ $_exit_code -ne 0 && $(echo "${_command_output}" | grep -i "you should reboot now") ]] && RC=1 +} # LVM stripe, format, mount ephemeral drives function setup_ephemeral_drives () { @@ -48,11 +55,11 @@ function setup_ephemeral_drives () { for d in $DEVS; do d=/dev/${d} dd if=/dev/zero of=${d} bs=32k count=1 || RC=1 - parted -s ${d} mklabel gpt || RC=1 - parted -s ${d} || RC=1 - parted -s -a optimal ${d} mkpart primary 1MB 100% || RC=1 - partprobe - parted -s ${d} set 1 lvm on || RC=1 + exec_command "parted -s ${d} mklabel gpt" + exec_command "parted -s ${d}" + exec_command "parted -s -a optimal ${d} mkpart primary 1MB 100%" + partprobe ${d} + exec_command "parted -s ${d} set 1 lvm on" if [ $IS_NVME -eq 1 ]; then PARTITIONS="${d}p1 $PARTITIONS" else diff --git a/metadata.rb b/metadata.rb index d5750390f8..48d6c1233b 100644 --- a/metadata.rb +++ b/metadata.rb @@ -7,7 +7,7 @@ issues_url 'https://github.com/aws/aws-parallelcluster-cookbook/issues' source_url 'https://github.com/aws/aws-parallelcluster-cookbook' chef_version '14.2.0' -version '2.3.1' +version '2.4.0' supports 'amazon' supports 'centos', '= 6' @@ -27,3 +27,4 @@ depends 'apt', '~> 7.0.0' depends 'hostname', '~> 0.4.2' depends 'line', '~> 1.0.6' +depends 'ulimit', '~> 1.0.0' \ No newline at end of file diff --git a/recipes/_compute_base_config.rb b/recipes/_compute_base_config.rb index 515fa41b6d..e6049b8188 100644 --- a/recipes/_compute_base_config.rb +++ b/recipes/_compute_base_config.rb @@ -13,22 
+13,8 @@ # OR CONDITIONS OF ANY KIND, express or implied. See the License for the specific language governing permissions and # limitations under the License. -# Created shared mount point -directory node['cfncluster']['cfn_shared_dir'] do - mode '1777' - owner 'root' - group 'root' - recursive true - action :create -end - -node.default['cfncluster']['cfn_master'] = node['cfncluster']['cfn_master'].split('.')[0] - nfs_master = node['cfncluster']['cfn_master'] -# Mount EFS directory with efs_mount recipe -include_recipe 'aws-parallelcluster::efs_mount' - # Parse and get RAID shared directory info and turn into an array raid_shared_dir = node['cfncluster']['cfn_raid_parameters'].split(',')[0] diff --git a/recipes/_compute_sge_config.rb b/recipes/_compute_sge_config.rb index 88aaa1ae0b..c37389e6c3 100644 --- a/recipes/_compute_sge_config.rb +++ b/recipes/_compute_sge_config.rb @@ -14,7 +14,7 @@ # limitations under the License. # Mount /opt/sge over NFS -nfs_master = node['cfncluster']['cfn_master'].split('.')[0] +nfs_master = node['cfncluster']['cfn_master'] mount '/opt/sge' do device "#{nfs_master}:/opt/sge" fstype "nfs" diff --git a/recipes/_compute_slurm_config.rb b/recipes/_compute_slurm_config.rb index e23f2deda7..5f056f62c5 100644 --- a/recipes/_compute_slurm_config.rb +++ b/recipes/_compute_slurm_config.rb @@ -14,7 +14,7 @@ # limitations under the License. 
# Mount /opt/slurm over NFS -nfs_master = node['cfncluster']['cfn_master'].split('.')[0] +nfs_master = node['cfncluster']['cfn_master'] mount '/opt/slurm' do device "#{nfs_master}:/opt/slurm" fstype "nfs" diff --git a/recipes/_compute_torque_config.rb b/recipes/_compute_torque_config.rb index a049c10abd..9db0347f92 100644 --- a/recipes/_compute_torque_config.rb +++ b/recipes/_compute_torque_config.rb @@ -31,7 +31,11 @@ end # Enable and start pbs_mom service +# pbs_mom is restarted only after network service is restarted in +# order to wait for the hostname changes to be applied service "pbs_mom" do supports restart: true - action %i[enable restart] + action %i[enable start] + subscribes :restart, 'service[network]', :immediately + subscribes :restart, 'ohai[reload_hostname]', :delayed end diff --git a/recipes/_default_pre.rb b/recipes/_default_pre.rb index 4734b915a9..860e454326 100644 --- a/recipes/_default_pre.rb +++ b/recipes/_default_pre.rb @@ -13,6 +13,10 @@ # OR CONDITIONS OF ANY KIND, express or implied. See the License for the specific language governing permissions and # limitations under the License. +user_ulimit "*" do + filehandle_limit 10000 +end + include_recipe 'aws-parallelcluster::_update_packages' # Reboot after preliminary configuration steps diff --git a/recipes/_efa_enable.rb b/recipes/_efa_enable.rb new file mode 100644 index 0000000000..8eea1e8da7 --- /dev/null +++ b/recipes/_efa_enable.rb @@ -0,0 +1,22 @@ +# +# Cookbook Name:: aws-parallelcluster +# Recipe:: _efa_enable +# +# Copyright 2013-2019 Amazon.com, Inc. or its affiliates. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"). You may not use this file except in compliance with the +# License. A copy of the License is located at +# +# http://aws.amazon.com/apache2.0/ +# +# or in the "LICENSE.txt" file accompanying this file. This file is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES +# OR CONDITIONS OF ANY KIND, express or implied. 
See the License for the specific language governing permissions and +# limitations under the License. + +bash "enable efa limits" do + cwd node['cfncluster']['sources_dir'] + code <<-EFAENABLE + cd aws-efa-installer/install + ./efa_limits_setup.sh + EFAENABLE +end diff --git a/recipes/_efa_install.rb b/recipes/_efa_install.rb new file mode 100644 index 0000000000..4aa6498eef --- /dev/null +++ b/recipes/_efa_install.rb @@ -0,0 +1,48 @@ +# +# Cookbook Name:: aws-parallelcluster +# Recipe:: _efa_install +# +# Copyright 2013-2019 Amazon.com, Inc. or its affiliates. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"). You may not use this file except in compliance with the +# License. A copy of the License is located at +# +# http://aws.amazon.com/apache2.0/ +# +# or in the "LICENSE.txt" file accompanying this file. This file is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES +# OR CONDITIONS OF ANY KIND, express or implied. See the License for the specific language governing permissions and +# limitations under the License. 
+ +efa_tarball = "#{node['cfncluster']['sources_dir']}/aws-efa-installer-latest.tar.gz" + +# Get EFA Installer +remote_file efa_tarball do + source node['cfncluster']['efa']['installer_url'] + mode '0644' + retries 3 + retry_delay 5 + not_if { ::File.exist?(efa_tarball) } +end + +# default openmpi installation conflicts with new install +# new one is installed in /opt/amazon/efa/bin/ +case node['platform_family'] +when 'rhel', 'amazon' + package %w[openmpi-devel openmpi] do + action :remove + end +when 'debian' + package "libopenmpi-dev" do + action :remove + end +end + +bash "install efa" do + cwd node['cfncluster']['sources_dir'] + code <<-EFAINSTALL + tar -xzf #{efa_tarball} + cd aws-efa-installer + ./efa_installer.sh -y --skip-limit-conf + EFAINSTALL + creates '/opt/amazon/efa/bin/mpirun' +end diff --git a/recipes/_lustre_install.rb b/recipes/_lustre_install.rb index 52498f073e..63b3ad4ec5 100644 --- a/recipes/_lustre_install.rb +++ b/recipes/_lustre_install.rb @@ -55,4 +55,4 @@ retry_delay 5 end -end \ No newline at end of file +end diff --git a/recipes/_master_base_config.rb b/recipes/_master_base_config.rb index 2e0e9f596b..073ad3da5f 100644 --- a/recipes/_master_base_config.rb +++ b/recipes/_master_base_config.rb @@ -28,9 +28,6 @@ # Get VPC CIDR node.default['cfncluster']['ec2-metadata']['vpc-ipv4-cidr-block'] = get_vpc_ipv4_cidr_block(node['macaddress']) -# Mount EFS directory with efs_mount recipe -include_recipe 'aws-parallelcluster::efs_mount' - # Parse shared directory info and turn into an array shared_dir_array = node['cfncluster']['cfn_shared_dir'].split(',') shared_dir_array.each_with_index do |dir, index| diff --git a/recipes/_master_sge_config.rb b/recipes/_master_sge_config.rb index 47b12ece4f..f45d287b38 100644 --- a/recipes/_master_sge_config.rb +++ b/recipes/_master_sge_config.rb @@ -73,3 +73,18 @@ /opt/sge/util/qconf_add_list_value -mconf qmaster_params ENABLE_FORCED_QDEL global ENABLEFORCEDQDEL end + +# max_unheard: host is set to unknown 
after being unresponsive for the configured timeout +# reschedule_unknown: jobs on hosts in an unknown state are rescheduled/deleted after the configured timeout +# ENABLE_FORCED_QDEL_IF_UNKNOWN: force deletion on qdel command for hosts in unknown state +# ENABLE_RESCHEDULE_KILL: reschedule_unknown parameter affects also jobs which have the rerun flag not activated +bash "configure_unknown_hosts_behaviour" do + code <<-CONFIGUNKNOWN + set -e + . /opt/sge/default/common/settings.sh + /opt/sge/util/qconf_mod_attr -mconf max_unheard 00:03:00 global + /opt/sge/util/qconf_mod_attr -mconf reschedule_unknown 00:00:30 global + /opt/sge/util/qconf_add_list_value -mconf qmaster_params ENABLE_FORCED_QDEL_IF_UNKNOWN global + /opt/sge/util/qconf_add_list_value -mconf qmaster_params ENABLE_RESCHEDULE_KILL=1 global + CONFIGUNKNOWN +end diff --git a/recipes/base_config.rb b/recipes/base_config.rb index 9f67ab5f77..eeec359284 100644 --- a/recipes/base_config.rb +++ b/recipes/base_config.rb @@ -53,14 +53,17 @@ mode '0644' end -# Restart supervisord -service "supervisord" do - supports restart: true - action %i[enable start] -end +# Mount EFS directory with efs_mount recipe +include_recipe 'aws-parallelcluster::efs_mount' + +# Mount FSx directory with fsx_mount recipe +include_recipe 'aws-parallelcluster::fsx_mount' -# Only run FSx on centos for now -if node['platform'] == 'centos' or node['platform'] == 'amazon' - # Mount FSx - include_recipe 'aws-parallelcluster::fsx_mount' +# Enable EFA +if node['cfncluster']['enable_efa'] == 'compute' && node['cfncluster']['cfn_node_type'] == 'ComputeFleet' + include_recipe "aws-parallelcluster::_efa_enable" +elsif node['cfncluster']['enable_efa'] == 'compute' + user_ulimit "*" do + memory_limit 'unlimited' + end end diff --git a/recipes/base_install.rb b/recipes/base_install.rb index 27d348104e..9d37067544 100644 --- a/recipes/base_install.rb +++ b/recipes/base_install.rb @@ -235,3 +235,17 @@ # Install FSx options include_recipe 
"aws-parallelcluster::_lustre_install" + +# Install EFA +if (node['platform'] == 'centos' && node['platform_version'].to_i >= 7) || node['platform'] == 'amazon' || (node['platform'] == 'ubuntu' && node['platform_version'] == "16.04") + unless node['cfncluster']['cfn_region'].start_with?("cn-") + include_recipe "aws-parallelcluster::_efa_install" + else + case node['platform_family'] + when 'rhel', 'amazon' + package %w[openmpi-devel openmpi] + when 'debian' + package "libopenmpi-dev" + end + end +end diff --git a/recipes/finalize.rb b/recipes/finalize.rb new file mode 100644 index 0000000000..28504afea9 --- /dev/null +++ b/recipes/finalize.rb @@ -0,0 +1,26 @@ +# +# Cookbook Name:: aws-parallelcluster +# Recipe:: finalize +# +# Copyright 2019 Amazon.com, Inc. or its affiliates. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"). You may not use this file except in compliance with the +# License. A copy of the License is located at +# +# http://aws.amazon.com/apache2.0/ +# +# or in the "LICENSE.txt" file accompanying this file. This file is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES +# OR CONDITIONS OF ANY KIND, express or implied. See the License for the specific language governing permissions and +# limitations under the License. 
+ +# Restart supervisord +service "supervisord" do + supports restart: true + action %i[enable start] +end + +execute "compute_ready" do + command "/opt/parallelcluster/scripts/compute_ready" + environment('PATH' => '/usr/local/bin:/usr/bin/:$PATH') + only_if { node['cfncluster']['cfn_node_type'] == 'ComputeFleet' } +end diff --git a/recipes/sge_config.rb b/recipes/sge_config.rb index 233497cf13..eb48a64656 100644 --- a/recipes/sge_config.rb +++ b/recipes/sge_config.rb @@ -16,7 +16,6 @@ include_recipe 'aws-parallelcluster::base_config' include_recipe 'aws-parallelcluster::sge_install' -# case node['cfncluster']['cfn_node_type'] case node['cfncluster']['cfn_node_type'] when 'MasterServer' include_recipe 'aws-parallelcluster::_master_sge_config' diff --git a/recipes/sge_install.rb b/recipes/sge_install.rb index 929420f048..86973a7065 100644 --- a/recipes/sge_install.rb +++ b/recipes/sge_install.rb @@ -15,92 +15,117 @@ include_recipe 'aws-parallelcluster::base_install' -sge_tarball = "#{node['cfncluster']['sources_dir']}/sge-#{node['cfncluster']['sge']['version']}.tar.gz" +case node['cfncluster']['cfn_node_type'] +when 'MasterServer', nil + sge_tarball = "#{node['cfncluster']['sources_dir']}/sge-#{node['cfncluster']['sge']['version']}.tar.gz" -# Get SGE tarball -remote_file sge_tarball do - source node['cfncluster']['sge']['url'] - mode '0644' - retries 3 - retry_delay 5 - # TODO: Add version or checksum checks - not_if { ::File.exist?(sge_tarball) } -end + # Get SGE tarball + remote_file sge_tarball do + source node['cfncluster']['sge']['url'] + mode '0644' + retries 3 + retry_delay 5 + # TODO: Add version or checksum checks + not_if { ::File.exist?(sge_tarball) } + end -# Install SGE -bash 'make install' do - user 'root' - group 'root' - cwd Chef::Config[:file_cache_path] - environment 'SGE_ROOT' => '/opt/sge' - code <<-SGE - tar xf #{sge_tarball} - cd sge-#{node['cfncluster']['sge']['version']}/source - CORES=$(grep processor /proc/cpuinfo | wc -l) - sh 
scripts/bootstrap.sh -no-java -no-jni -no-herd - ./aimk -pam -no-remote -no-java -no-jni -no-herd -parallel $CORES - ./aimk -man -no-java -no-jni -no-herd -parallel $CORES - scripts/distinst -local -allall -noexit - mkdir $SGE_ROOT - echo instremote=false >> distinst.private - gearch=`dist/util/arch` - echo 'y'| scripts/distinst -local -allall ${gearch} - SGE - # TODO: Fix, so it works for upgrade - creates '/opt/sge/bin/lx-amd64/sge_qmaster' -end + # Install SGE + bash 'make install' do + user 'root' + group 'root' + cwd Chef::Config[:file_cache_path] + environment 'SGE_ROOT' => '/opt/sge' + code <<-SGE + tar xf #{sge_tarball} + cd sge-#{node['cfncluster']['sge']['version']}/source + CORES=$(grep processor /proc/cpuinfo | wc -l) + sh scripts/bootstrap.sh -no-java -no-jni -no-herd + ./aimk -pam -no-remote -no-java -no-jni -no-herd -parallel $CORES + ./aimk -man -no-java -no-jni -no-herd -parallel $CORES + scripts/distinst -local -allall -noexit + mkdir $SGE_ROOT + echo instremote=false >> distinst.private + gearch=`dist/util/arch` + echo 'y'| scripts/distinst -local -allall ${gearch} + SGE + # TODO: Fix, so it works for upgrade + creates '/opt/sge/bin/lx-amd64/sge_qmaster' + end -# Copy qconf utils (Downloaded from http://arc.liv.ac.uk/SGE/downloads/qconf_scripts.tar.gz) -cookbook_file 'qconf_scripts.tar.gz' do - path '/opt/sge/util/qconf_scripts.tar.gz' - user 'root' - group 'root' - mode '0644' -end + # Copy qconf utils (Downloaded from http://arc.liv.ac.uk/SGE/downloads/qconf_scripts.tar.gz) + cookbook_file 'qconf_scripts.tar.gz' do + path '/opt/sge/util/qconf_scripts.tar.gz' + user 'root' + group 'root' + mode '0644' + end -bash "extract_qconf_util" do - code <<-EXTRACTQCONFUTIL - tar xf /opt/sge/util/qconf_scripts.tar.gz -C /opt/sge/util --strip-components=1 --no-same-permissions --no-same-owner - EXTRACTQCONFUTIL -end + bash "extract_qconf_util" do + code <<-EXTRACTQCONFUTIL + set -e + tar xf /opt/sge/util/qconf_scripts.tar.gz -C /opt/sge/util 
--strip-components=1 --no-same-permissions --no-same-owner + # applying small patch for a bug in sge_edit_mod_attr script + # [[]] is incompatible with dash which is the default sh in ubuntu + sed -i 's/if \\[\\[ $cc -eq 0 ]]/if [ $cc -eq 0 ]/g' /opt/sge/util/sge_edit_mod_attr + EXTRACTQCONFUTIL + creates '/opt/sge/util/sge_edit_mod_attr' + end -# Disbale the AddQueue, so that we can manage slots per instance -replace_or_add "AddQueue" do - path "/opt/sge/inst_sge" - pattern "AddQueue" - line "#AddQueue" -end + # Disable the AddQueue, so that we can manage slots per instance + replace_or_add "AddQueue" do + path "/opt/sge/inst_sge" + pattern "AddQueue" + line "#AddQueue" + end -# Only on CentOS/RHEL7 update the initd -if node['platform_family'] == 'rhel' && node['platform_version'].to_i >= 7 && node['platform'] != 'amazon' - execute 'sed' do - command 'sed -i s/remote_fs/local_fs/g /opt/sge/util/rctemplates/sgemaster_template' + # Only on CentOS/RHEL7 update the initd + if node['platform_family'] == 'rhel' && node['platform_version'].to_i >= 7 && node['platform'] != 'amazon' + execute 'sed' do + command 'sed -i s/remote_fs/local_fs/g /opt/sge/util/rctemplates/sgemaster_template' + end + execute 'sed' do + command 'sed -i s/remote_fs/local_fs/g /opt/sge/util/rctemplates/sgeexecd_template' + end end - execute 'sed' do - command 'sed -i s/remote_fs/local_fs/g /opt/sge/util/rctemplates/sgeexecd_template' + + # Setup sgeadmin user + user "sgeadmin" do + manage_home true + comment 'sgeadmin user' + home "/home/sgeadmin" + system true + shell '/bin/bash' end -end -# Setup sgeadmin user -user "sgeadmin" do - manage_home true - comment 'sgeadmin user' - home "/home/sgeadmin" - system true - shell '/bin/bash' -end + # Copy required licensing files + directory "#{node['cfncluster']['license_dir']}/sge" -# Copy required licensing files -directory "#{node['cfncluster']['license_dir']}/sge" + bash 'copy license stuff' do + user 'root' + group 'root' + cwd 
Chef::Config[:file_cache_path] + code <<-SGELICENSE + cd sge-#{node['cfncluster']['sge']['version']}/LICENCES + cp -v SISSL #{node['cfncluster']['license_dir']}/sge/SISSL + SGELICENSE + # TODO: Fix, so it works for upgrade + creates "#{node['cfncluster']['license_dir']}/sge/SISSL" + end +when 'ComputeFleet' + # Created SGE shared mount point + directory "/opt/sge" do + mode '1777' + owner 'root' + group 'root' + action :create + end -bash 'copy license stuff' do - user 'root' - group 'root' - cwd Chef::Config[:file_cache_path] - code <<-SGELICENSE - cd sge-#{node['cfncluster']['sge']['version']}/LICENCES - cp -v SISSL #{node['cfncluster']['license_dir']}/sge/SISSL - SGELICENSE - # TODO: Fix, so it works for upgrade - creates "#{node['cfncluster']['license_dir']}/sge/SISSL" + # Setup sgeadmin user without creating the home (mounted from master) + user "sgeadmin" do + manage_home false + comment 'sgeadmin user' + home "/home/sgeadmin" + system true + shell '/bin/bash' + end end diff --git a/recipes/slurm_config.rb b/recipes/slurm_config.rb index 5ae6499ff7..6786bc9154 100644 --- a/recipes/slurm_config.rb +++ b/recipes/slurm_config.rb @@ -37,7 +37,6 @@ only_if { node['init_package'] != 'systemd' } end -# case node['cfncluster']['cfn_node_type'] case node['cfncluster']['cfn_node_type'] when 'MasterServer' include_recipe 'aws-parallelcluster::_master_slurm_config' diff --git a/recipes/slurm_install.rb b/recipes/slurm_install.rb index d8abad4457..a0720c7e94 100644 --- a/recipes/slurm_install.rb +++ b/recipes/slurm_install.rb @@ -16,42 +16,81 @@ include_recipe 'aws-parallelcluster::base_install' include_recipe 'aws-parallelcluster::munge_install' -slurm_tarball = "#{node['cfncluster']['sources_dir']}/slurm-#{node['cfncluster']['slurm']['version']}.tar.gz" +case node['cfncluster']['cfn_node_type'] +when 'MasterServer', nil + slurm_tarball = "#{node['cfncluster']['sources_dir']}/slurm-#{node['cfncluster']['slurm']['version']}.tar.gz" -# Get slurm tarball -remote_file 
slurm_tarball do - source node['cfncluster']['slurm']['url'] - mode '0644' - retries 3 - retry_delay 5 - # TODO: Add version or checksum checks - not_if { ::File.exist?(slurm_tarball) } -end + # Get slurm tarball + remote_file slurm_tarball do + source node['cfncluster']['slurm']['url'] + mode '0644' + retries 3 + retry_delay 5 + # TODO: Add version or checksum checks + not_if { ::File.exist?(slurm_tarball) } + end -# Install Slurm -bash 'make install' do - user 'root' - group 'root' - cwd Chef::Config[:file_cache_path] - code <<-SLURM - tar xf #{slurm_tarball} - cd slurm-slurm-#{node['cfncluster']['slurm']['version']} - ./configure --prefix=/opt/slurm - CORES=$(grep processor /proc/cpuinfo | wc -l) - make -j $CORES - make install - SLURM - # TODO: Fix, so it works for upgrade - creates '/opt/slurm/bin/srun' -end + # Install Slurm + bash 'make install' do + user 'root' + group 'root' + cwd Chef::Config[:file_cache_path] + code <<-SLURM + tar xf #{slurm_tarball} + cd slurm-slurm-#{node['cfncluster']['slurm']['version']} + ./configure --prefix=/opt/slurm + CORES=$(grep processor /proc/cpuinfo | wc -l) + make -j $CORES + make install + make install-contrib + SLURM + # TODO: Fix, so it works for upgrade + creates '/opt/slurm/bin/srun' + end + + # Setup slurm user + user "slurm" do + manage_home true + comment 'slurm user' + home "/home/slurm" + system true + shell '/bin/bash' + end + + # Copy required licensing files + directory "#{node['cfncluster']['license_dir']}/slurm" -# Setup slurm user -user "slurm" do - manage_home true - comment 'slurm user' - home "/home/slurm" - system true - shell '/bin/bash' + bash 'copy license stuff' do + user 'root' + group 'root' + cwd Chef::Config[:file_cache_path] + code <<-SLURMLICENSE + cd slurm-slurm-#{node['cfncluster']['slurm']['version']} + cp -v COPYING #{node['cfncluster']['license_dir']}/slurm/COPYING + cp -v DISCLAIMER #{node['cfncluster']['license_dir']}/slurm/DISCLAIMER + cp -v LICENSE.OpenSSL 
#{node['cfncluster']['license_dir']}/slurm/LICENSE.OpenSSL + cp -v README.rst #{node['cfncluster']['license_dir']}/slurm/README.rst + SLURMLICENSE + # TODO: Fix, so it works for upgrade + creates "#{node['cfncluster']['license_dir']}/slurm/README.rst" + end +when 'ComputeFleet' + # Created Slurm shared mount point + directory "/opt/slurm" do + mode '1777' + owner 'root' + group 'root' + action :create + end + + # Setup slurm user without creating the home (mounted from master) + user "slurm" do + manage_home false + comment 'slurm user' + home "/home/slurm" + system true + shell '/bin/bash' + end end cookbook_file '/etc/init.d/slurm' do @@ -62,21 +101,3 @@ action :create only_if { node['platform_family'] == 'debian' && !node['init_package'] == 'systemd' } end - -# Copy required licensing files -directory "#{node['cfncluster']['license_dir']}/slurm" - -bash 'copy license stuff' do - user 'root' - group 'root' - cwd Chef::Config[:file_cache_path] - code <<-SLURMLICENSE - cd slurm-slurm-#{node['cfncluster']['slurm']['version']} - cp -v COPYING #{node['cfncluster']['license_dir']}/slurm/COPYING - cp -v DISCLAIMER #{node['cfncluster']['license_dir']}/slurm/DISCLAIMER - cp -v LICENSE.OpenSSL #{node['cfncluster']['license_dir']}/slurm/LICENSE.OpenSSL - cp -v README.rst #{node['cfncluster']['license_dir']}/slurm/README.rst - SLURMLICENSE - # TODO: Fix, so it works for upgrade - creates "#{node['cfncluster']['license_dir']}/slurm/README.rst" -end diff --git a/recipes/tests.rb b/recipes/tests.rb index 239c4bf259..2fe2b5c0ec 100644 --- a/recipes/tests.rb +++ b/recipes/tests.rb @@ -31,6 +31,13 @@ AWSREGIONS end +unless node['cfncluster']['os'].end_with?("-custom") + bash 'test soft ulimit nofile' do + code "if (($(ulimit -Sn) < 10000)); then exit 1; fi" + user node['cfncluster']['cfn_cluster_user'] + end +end + if node['cfncluster']['cfn_scheduler'] == 'sge' case node['cfncluster']['cfn_node_type'] when 'MasterServer' @@ -99,13 +106,30 @@ end end -bash 'execute jq' do - cwd 
Chef::Config[:file_cache_path] - code <<-JQMERGE - # Set PATH as in the UserData script of the CloudFormation template - export PATH="/usr/local/sbin:/usr/local/bin:/sbin:/bin:/usr/sbin:/usr/bin:/opt/aws/bin" - echo '{"cfncluster": {"cfn_region": "eu-west-3"}, "run_list": "recipe[aws-parallelcluster::sge_config]"}' > /tmp/dna.json - echo '{ "cfncluster" : { "ganglia_enabled" : "yes" } }' > /tmp/extra.json - jq --argfile f1 /tmp/dna.json --argfile f2 /tmp/extra.json -n '$f1 + $f2 | .cfncluster = $f1.cfncluster + $f2.cfncluster' || exit 1 - JQMERGE +unless node['cfncluster']['cfn_region'].start_with?("cn-") + case node['cfncluster']['os'] + when 'alinux', 'centos7' + execute 'check efa rpm installed' do + command "rpm -qa | grep libfabric && rpm -qa | grep efa-" + user node['cfncluster']['cfn_cluster_user'] + end + when 'ubuntu1604' + execute 'check efa rpm installed' do + command "dpkg -l | grep libfabric && dpkg -l | grep 'efa '" + user node['cfncluster']['cfn_cluster_user'] + end + end +end + +unless node['cfncluster']['os'].end_with?("-custom") + bash 'execute jq' do + cwd Chef::Config[:file_cache_path] + code <<-JQMERGE + # Set PATH as in the UserData script of the CloudFormation template + export PATH="/usr/local/sbin:/usr/local/bin:/sbin:/bin:/usr/sbin:/usr/bin:/opt/aws/bin" + echo '{"cfncluster": {"cfn_region": "eu-west-3"}, "run_list": "recipe[aws-parallelcluster::sge_config]"}' > /tmp/dna.json + echo '{ "cfncluster" : { "ganglia_enabled" : "yes" } }' > /tmp/extra.json + jq --argfile f1 /tmp/dna.json --argfile f2 /tmp/extra.json -n '$f1 + $f2 | .cfncluster = $f1.cfncluster + $f2.cfncluster' || exit 1 + JQMERGE + end end diff --git a/templates/default/cfnconfig.erb b/templates/default/cfnconfig.erb index 6c75330bf1..1c9c55de69 100644 --- a/templates/default/cfnconfig.erb +++ b/templates/default/cfnconfig.erb @@ -2,7 +2,7 @@ stack_name=<%= node['cfncluster']['stack_name'] %> cfn_preinstall=<%= node['cfncluster']['cfn_preinstall'] %> cfn_preinstall_args=<%= 
node['cfncluster']['cfn_preinstall_args'] %> cfn_postinstall=<%= node['cfncluster']['cfn_postinstall'] %> -cfn_postinstall_args="<%= node['cfncluster']['cfn_postinstall_args'] %>" +cfn_postinstall_args=<%= node['cfncluster']['cfn_postinstall_args'] %> cfn_region=<%= node['cfncluster']['cfn_region'] %> cfn_scheduler=<%= node['cfncluster']['cfn_scheduler'] %> cfn_scheduler_slots=<%= node['cfncluster']['cfn_scheduler_slots'] %> diff --git a/templates/default/jobwatcher.cfg.erb b/templates/default/jobwatcher.cfg.erb index 84b7f25330..16f9c54d12 100644 --- a/templates/default/jobwatcher.cfg.erb +++ b/templates/default/jobwatcher.cfg.erb @@ -4,4 +4,3 @@ scheduler = <%= node['cfncluster']['cfn_scheduler'] %> stack_name = <%= node['cfncluster']['stack_name'] %> cfncluster_dir = <%= node['cfncluster']['base_dir'] %> proxy = <%= node['cfncluster']['cfn_proxy'] %> -compute_instance_type = <%= node['cfncluster']['compute_instance_type'] %> diff --git a/templates/default/slurm.conf.erb b/templates/default/slurm.conf.erb index ad049832a5..9bdee0e513 100644 --- a/templates/default/slurm.conf.erb +++ b/templates/default/slurm.conf.erb @@ -55,7 +55,7 @@ ReturnToService=1 # # TIMERS SlurmctldTimeout=300 -SlurmdTimeout=300 +SlurmdTimeout=120 InactiveLimit=0 MinJobAge=300 KillWait=30 diff --git a/templates/default/sqswatcher.cfg.erb b/templates/default/sqswatcher.cfg.erb index 5e1b6b84bf..bc2e6e37b2 100644 --- a/templates/default/sqswatcher.cfg.erb +++ b/templates/default/sqswatcher.cfg.erb @@ -5,5 +5,4 @@ table_name = <%= node['cfncluster']['cfn_ddb_table'] %> scheduler = <%= node['cfncluster']['cfn_scheduler'] %> cluster_user = <%= node['cfncluster']['cfn_cluster_user'] %> proxy = <%= node['cfncluster']['cfn_proxy'] %> -max_queue_size = <%= node['cfncluster']['cfn_max_queue_size'] %> stack_name = <%= node['cfncluster']['stack_name'] %> diff --git a/util/bump-version.sh b/util/bump-version.sh index e02c838bb0..a832a17900 100755 --- a/util/bump-version.sh +++ b/util/bump-version.sh @@ 
-10,7 +10,7 @@ fi NEW_VERSION=$1 CURRENT_VERSION=$(sed -ne "s/^version '\(.*\)'/\1/p" metadata.rb) -sed -i -e "s/\(.*parallelcluster.*version.*\)$CURRENT_VERSION\(.*\)/\1$NEW_VERSION\2/g" amis/packer_variables.json +sed -i -e "s/\(.*parallelcluster.*version.*\)$CURRENT_VERSION.*\(\".*\)/\1$NEW_VERSION\2/g" amis/packer_variables.json sed -i "s/default\['cfncluster'\]\['cfncluster-version'\] = '$CURRENT_VERSION'/default['cfncluster']['cfncluster-version'] = '$NEW_VERSION'/g" attributes/default.rb sed -i "s/default\['cfncluster'\]\['cfncluster-node-version'\] = '$CURRENT_VERSION'/default['cfncluster']['cfncluster-node-version'] = '$NEW_VERSION'/g" attributes/default.rb sed -i "s/version '$CURRENT_VERSION'/version '$NEW_VERSION'/g" metadata.rb diff --git a/util/chef-install.sh b/util/chef-install.sh new file mode 100644 index 0000000000..c7347a08f9 --- /dev/null +++ b/util/chef-install.sh @@ -0,0 +1,793 @@ +#!/bin/sh +# WARNING: REQUIRES /bin/sh +# +# - must run on /bin/sh on solaris 9 +# - must run on /bin/sh on AIX 6.x +# +# Copyright:: Copyright (c) 2010-2015 Chef Software, Inc. +# License:: Apache License, Version 2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +# helpers.sh +############ +# This section has some helper functions to make life easier. +# +# Outputs: +# $tmp_dir: secure-ish temp directory that can be used during installation. 
+############ + +# Check whether a command exists - returns 0 if it does, 1 if it does not +exists() { + if command -v $1 >/dev/null 2>&1 + then + return 0 + else + return 1 + fi +} + +# Output the instructions to report bug about this script +report_bug() { + echo "Version: $version" + echo "" + echo "Please file a Bug Report at https://github.com/chef/omnitruck/issues/new" + echo "Alternatively, feel free to open a Support Ticket at https://www.chef.io/support/tickets" + echo "More Chef support resources can be found at https://www.chef.io/support" + echo "" + echo "Please include as many details about the problem as possible i.e., how to reproduce" + echo "the problem (if possible), type of the Operating System and its version, etc.," + echo "and any other relevant details that might help us with troubleshooting." + echo "" +} + +checksum_mismatch() { + echo "Package checksum mismatch!" + report_bug + exit 1 +} + +unable_to_retrieve_package() { + echo "Unable to retrieve a valid package!" + report_bug + echo "Metadata URL: $metadata_url" + if test "x$download_url" != "x"; then + echo "Download URL: $download_url" + fi + if test "x$stderr_results" != "x"; then + echo "\nDEBUG OUTPUT FOLLOWS:\n$stderr_results" + fi + exit 1 +} + +http_404_error() { + echo "Omnitruck artifact does not exist for version $version on platform $platform" + echo "" + echo "Either this means:" + echo " - We do not support $platform" + echo " - We do not have an artifact for $version" + echo "" + echo "This is often the latter case due to running a prerelease or RC version of chef" + echo "or a gem version which was only pushed to rubygems and not omnitruck." + echo "" + echo "You may be able to set your knife[:bootstrap_version] to the most recent stable" + echo "release of Chef to fix this problem (or the most recent stable major version number)." 
+ echo "" + echo "In order to test the version parameter, adventurous users may take the Metadata URL" + echo "below and modify the '&v=' parameter until you successfully get a URL that" + echo "does not 404 (e.g. via curl or wget). You should be able to use '&v=11' or '&v=12'" + echo "succesfully." + echo "" + echo "If you cannot fix this problem by setting the bootstrap_version, it probably means" + echo "that $platform is not supported." + echo "" + # deliberately do not call report_bug to suppress bug report noise. + echo "Metadata URL: $metadata_url" + if test "x$download_url" != "x"; then + echo "Download URL: $download_url" + fi + if test "x$stderr_results" != "x"; then + echo "\nDEBUG OUTPUT FOLLOWS:\n$stderr_results" + fi + exit 1 +} + +capture_tmp_stderr() { + # spool up /tmp/stderr from all the commands we called + if test -f "$tmp_dir/stderr"; then + output=`cat $tmp_dir/stderr` + stderr_results="${stderr_results}\nSTDERR from $1:\n\n$output\n" + rm $tmp_dir/stderr + fi +} + +# do_wget URL FILENAME +do_wget() { + echo "trying wget..." + wget --user-agent="User-Agent: mixlib-install/3.11.5" -O "$2" "$1" 2>$tmp_dir/stderr + rc=$? + # check for 404 + grep "ERROR 404" $tmp_dir/stderr 2>&1 >/dev/null + if test $? -eq 0; then + echo "ERROR 404" + http_404_error + fi + + # check for bad return status or empty output + if test $rc -ne 0 || test ! -s "$2"; then + capture_tmp_stderr "wget" + return 1 + fi + + return 0 +} + +# do_curl URL FILENAME +do_curl() { + echo "trying curl..." + curl -A "User-Agent: mixlib-install/3.11.5" --retry 5 -sL -D $tmp_dir/stderr "$1" > "$2" + rc=$? + # check for 404 + grep "404 Not Found" $tmp_dir/stderr 2>&1 >/dev/null + if test $? -eq 0; then + echo "ERROR 404" + http_404_error + fi + + # check for bad return status or empty output + if test $rc -ne 0 || test ! -s "$2"; then + capture_tmp_stderr "curl" + return 1 + fi + + return 0 +} + +# do_fetch URL FILENAME +do_fetch() { + echo "trying fetch..." 
+ fetch --user-agent="User-Agent: mixlib-install/3.11.5" -o "$2" "$1" 2>$tmp_dir/stderr + # check for bad return status + test $? -ne 0 && return 1 + return 0 +} + +# do_perl URL FILENAME +do_perl() { + echo "trying perl..." + perl -e 'use LWP::Simple; getprint($ARGV[0]);' "$1" > "$2" 2>$tmp_dir/stderr + rc=$? + # check for 404 + grep "404 Not Found" $tmp_dir/stderr 2>&1 >/dev/null + if test $? -eq 0; then + echo "ERROR 404" + http_404_error + fi + + # check for bad return status or empty output + if test $rc -ne 0 || test ! -s "$2"; then + capture_tmp_stderr "perl" + return 1 + fi + + return 0 +} + +# do_python URL FILENAME +do_python() { + echo "trying python..." + python -c "import sys,urllib2; sys.stdout.write(urllib2.urlopen(urllib2.Request(sys.argv[1], headers={ 'User-Agent': 'mixlib-install/3.11.5' })).read())" "$1" > "$2" 2>$tmp_dir/stderr + rc=$? + # check for 404 + grep "HTTP Error 404" $tmp_dir/stderr 2>&1 >/dev/null + if test $? -eq 0; then + echo "ERROR 404" + http_404_error + fi + + # check for bad return status or empty output + if test $rc -ne 0 || test ! -s "$2"; then + capture_tmp_stderr "python" + return 1 + fi + return 0 +} + +# returns 0 if checksums match +do_checksum() { + if exists sha256sum; then + echo "Comparing checksum with sha256sum..." + checksum=`sha256sum $1 | awk '{ print $1 }'` + return `test "x$checksum" = "x$2"` + elif exists shasum; then + echo "Comparing checksum with shasum..." + checksum=`shasum -a 256 $1 | awk '{ print $1 }'` + return `test "x$checksum" = "x$2"` + else + echo "WARNING: could not find a valid checksum program, pre-install shasum or sha256sum in your O/S image to get valdation..." 
+ return 0 + fi +} + +# do_download URL FILENAME +do_download() { + echo "downloading $1" + echo " to file $2" + + url=`echo $1` + if test "x$platform" = "xsolaris2"; then + if test "x$platform_version" = "x5.9" -o "x$platform_version" = "x5.10"; then + # solaris 9 lacks openssl, solaris 10 lacks recent enough credentials - your base O/S is completely insecure, please upgrade + url=`echo $url | sed -e 's/https/http/'` + fi + fi + + # we try all of these until we get success. + # perl, in particular may be present but LWP::Simple may not be installed + + if exists wget; then + do_wget $url $2 && return 0 + fi + + if exists curl; then + do_curl $url $2 && return 0 + fi + + if exists fetch; then + do_fetch $url $2 && return 0 + fi + + if exists perl; then + do_perl $url $2 && return 0 + fi + + if exists python; then + do_python $url $2 && return 0 + fi + + unable_to_retrieve_package +} + +# install_file TYPE FILENAME +# TYPE is "rpm", "deb", "solaris", "sh", etc. +install_file() { + echo "Installing $project $version" + case "$1" in + "rpm") + if test "x$platform" = "xnexus" || test "x$platform" = "xios_xr"; then + echo "installing with yum..." + yum install -yv "$2" + else + echo "installing with rpm..." + rpm -Uvh --oldpackage --replacepkgs "$2" + fi + ;; + "deb") + echo "installing with dpkg..." + until dpkg -i "$2"; do + echo "Retrying dpkg -i $2 ..." + sleep 1 + done + ;; + "bff") + echo "installing with installp..." + installp -aXYgd "$2" all + ;; + "solaris") + echo "installing with pkgadd..." + echo "conflict=nocheck" > $tmp_dir/nocheck + echo "action=nocheck" >> $tmp_dir/nocheck + echo "mail=" >> $tmp_dir/nocheck + pkgrm -a $tmp_dir/nocheck -n $project >/dev/null 2>&1 || true + pkgadd -G -n -d "$2" -a $tmp_dir/nocheck $project + ;; + "pkg") + echo "installing with installer..." + cd / && /usr/sbin/installer -pkg "$2" -target / + ;; + "dmg") + echo "installing dmg file..." 
+ hdiutil detach "/Volumes/chef_software" >/dev/null 2>&1 || true + hdiutil attach "$2" -mountpoint "/Volumes/chef_software" + cd / && /usr/sbin/installer -pkg `find "/Volumes/chef_software" -name \*.pkg` -target / + hdiutil detach "/Volumes/chef_software" + ;; + "sh" ) + echo "installing with sh..." + sh "$2" + ;; + "p5p" ) + echo "installing p5p package..." + pkg install -g "$2" $project + ;; + *) + echo "Unknown filetype: $1" + report_bug + exit 1 + ;; + esac + if test $? -ne 0; then + echo "Installation failed" + report_bug + exit 1 + fi +} + +if test "x$TMPDIR" = "x"; then + tmp="/tmp" +else + tmp=$TMPDIR +fi +# secure-ish temp dir creation without having mktemp available (DDoS-able but not expliotable) +tmp_dir="$tmp/install.sh.$$" +(umask 077 && mkdir $tmp_dir) || exit 1 + +############ +# end of helpers.sh +############ + + +# script_cli_parameters.sh +############ +# This section reads the CLI parameters for the install script and translates +# them to the local parameters to be used later by the script. +# +# Outputs: +# $version: Requested version to be installed. +# $channel: Channel to install the product from +# $project: Project to be installed +# $cmdline_filename: Name of the package downloaded on local disk. +# $cmdline_dl_dir: Name of the directory downloaded package will be saved to on local disk. +# $install_strategy: Method of package installations. default strategy is to always install upon exec. Set to "once" to skip if project is installed +# $download_url_override: Install package downloaded from a direct URL. 
+# $checksum: SHA256 for download_url_override file (optional) +############ + +# Defaults +channel="stable" +project="chef" + +while getopts pnv:c:f:P:d:s:l:a opt +do + case "$opt" in + + v) version="$OPTARG";; + c) channel="$OPTARG";; + p) channel="current";; # compat for prerelease option + n) channel="current";; # compat for nightlies option + f) cmdline_filename="$OPTARG";; + P) project="$OPTARG";; + d) cmdline_dl_dir="$OPTARG";; + s) install_strategy="$OPTARG";; + l) download_url_override="$OPTARG";; + a) checksum="$OPTARG";; + \?) # unknown flag + echo >&2 \ + "usage: $0 [-P project] [-c release_channel] [-v version] [-f filename | -d download_dir] [-s install_strategy] [-l download_url_override] [-a checksum]" + exit 1;; + esac +done + +shift `expr $OPTIND - 1` + + +if test -d "/opt/$project" && test "x$install_strategy" = "xonce"; then + echo "$project installation detected" + echo "install_strategy set to 'once'" + echo "Nothing to install" + exit +fi + + +# platform_detection.sh +############ +# This section makes platform detection compatible with omnitruck on the system +# it runs. +# +# Outputs: +# $platform: Name of the platform. +# $platform_version: Version of the platform. +# $machine: System's architecture. +############ + +# +# Platform and Platform Version detection +# +# NOTE: This should now match ohai platform and platform_version matching. +# do not invented new platform and platform_version schemas, just make this behave +# like what ohai returns as platform and platform_version for the server. +# +# ALSO NOTE: Do not mangle platform or platform_version here. It is less error +# prone and more future-proof to do that in the server, and then all omnitruck clients +# will 'inherit' the changes (install.sh is not the only client of the omnitruck +# endpoint out there). +# + +machine=`uname -m` +os=`uname -s` + +if test -f "/etc/lsb-release" && grep -q DISTRIB_ID /etc/lsb-release && ! 
grep -q wrlinux /etc/lsb-release; then + platform=`grep DISTRIB_ID /etc/lsb-release | cut -d "=" -f 2 | tr '[A-Z]' '[a-z]'` + platform_version=`grep DISTRIB_RELEASE /etc/lsb-release | cut -d "=" -f 2` + + if test "$platform" = "\"cumulus linux\""; then + platform="cumulus_linux" + elif test "$platform" = "\"cumulus networks\""; then + platform="cumulus_networks" + fi + +elif test -f "/etc/debian_version"; then + platform="debian" + platform_version=`cat /etc/debian_version` +elif test -f "/etc/Eos-release"; then + # EOS may also contain /etc/redhat-release so this check must come first. + platform=arista_eos + platform_version=`awk '{print $4}' /etc/Eos-release` + machine="i386" +elif test -f "/etc/redhat-release"; then + platform=`sed 's/^\(.\+\) release.*/\1/' /etc/redhat-release | tr '[A-Z]' '[a-z]'` + platform_version=`sed 's/^.\+ release \([.0-9]\+\).*/\1/' /etc/redhat-release` + + if test "$platform" = "xenserver"; then + # Current XenServer 6.2 is based on CentOS 5, platform is not reset to "el" server should hanlde response + platform="xenserver" + else + # FIXME: use "redhat" + platform="el" + fi + +elif test -f "/etc/system-release"; then + platform=`sed 's/^\(.\+\) release.\+/\1/' /etc/system-release | tr '[A-Z]' '[a-z]'` + platform_version=`sed 's/^.\+ release \([.0-9]\+\).*/\1/' /etc/system-release | tr '[A-Z]' '[a-z]'` + case $platform in amazon*) # sh compat method of checking for a substring + platform="el" + + . /etc/os-release + platform_version=$VERSION_ID + if test "$platform_version" = "2"; then + platform_version="7" + else + # VERSION_ID will match YYYY.MM for Amazon Linux AMIs + platform_version="6" + fi + esac + +# Apple OS X +elif test -f "/usr/bin/sw_vers"; then + platform="mac_os_x" + # Matching the tab-space with sed is error-prone + platform_version=`sw_vers | awk '/^ProductVersion:/ { print $2 }' | cut -d. 
-f1,2` + + # x86_64 Apple hardware often runs 32-bit kernels (see OHAI-63) + x86_64=`sysctl -n hw.optional.x86_64` + if test $x86_64 -eq 1; then + machine="x86_64" + fi +elif test -f "/etc/release"; then + machine=`/usr/bin/uname -p` + if grep -q SmartOS /etc/release; then + platform="smartos" + platform_version=`grep ^Image /etc/product | awk '{ print $3 }'` + else + platform="solaris2" + platform_version=`/usr/bin/uname -r` + fi +elif test -f "/etc/SuSE-release"; then + if grep -q 'Enterprise' /etc/SuSE-release; + then + platform="sles" + platform_version=`awk '/^VERSION/ {V = $3}; /^PATCHLEVEL/ {P = $3}; END {print V "." P}' /etc/SuSE-release` + else + platform="suse" + platform_version=`awk '/^VERSION =/ { print $3 }' /etc/SuSE-release` + fi +elif test "x$os" = "xFreeBSD"; then + platform="freebsd" + platform_version=`uname -r | sed 's/-.*//'` +elif test "x$os" = "xAIX"; then + platform="aix" + platform_version="`uname -v`.`uname -r`" + machine="powerpc" +elif test -f "/etc/os-release"; then + . /etc/os-release + if test "x$CISCO_RELEASE_INFO" != "x"; then + . $CISCO_RELEASE_INFO + fi + + platform=$ID + platform_version=$VERSION +fi + +if test "x$platform" = "x"; then + echo "Unable to determine platform version!" + report_bug + exit 1 +fi + +# +# NOTE: platform manging in the install.sh is DEPRECATED +# +# - install.sh should be true to ohai and should not remap +# platform or platform versions. +# +# - remapping platform and mangling platform version numbers is +# now the complete responsibility of the server-side endpoints +# + +major_version=`echo $platform_version | cut -d. -f1` +case $platform in + # FIXME: should remove this case statement completely + "el") + # FIXME: "el" is deprecated, should use "redhat" + platform_version=$major_version + ;; + "debian") + if test "x$major_version" = "x5"; then + # This is here for potential back-compat. + # We do not have 5 in versions we publish for anymore but we + # might have it for earlier versions. 
+ platform_version="6" + else + platform_version=$major_version + fi + ;; + "freebsd") + platform_version=$major_version + ;; + "sles") + platform_version=$major_version + ;; + "suse") + platform_version=$major_version + ;; +esac + +# normalize the architecture we detected +case $machine in + "x86_64"|"amd64"|"x64") + machine="x86_64" + ;; + "i386"|"i86pc"|"x86"|"i686") + machine="i386" + ;; + "sparc"|"sun4u"|"sun4v") + machine="sparc" + ;; +esac + +if test "x$platform_version" = "x"; then + echo "Unable to determine platform version!" + report_bug + exit 1 +fi + +if test "x$platform" = "xsolaris2"; then + # hack up the path on Solaris to find wget, pkgadd + PATH=/usr/sfw/bin:/usr/sbin:$PATH + export PATH +fi + +echo "$platform $platform_version $machine" + +############ +# end of platform_detection.sh +############ + + +# All of the download utilities in this script load common proxy env vars. +# If variables are set they will override any existing env vars. +# Otherwise, default proxy env vars will be loaded by the respective +# download utility. + +if test "x$https_proxy" != "x"; then + echo "setting https_proxy: $https_proxy" + export HTTPS_PROXY=$https_proxy + export https_proxy=$https_proxy +fi + +if test "x$http_proxy" != "x"; then + echo "setting http_proxy: $http_proxy" + export HTTP_PROXY=$http_proxy + export http_proxy=$http_proxy +fi + +if test "x$ftp_proxy" != "x"; then + echo "setting ftp_proxy: $ftp_proxy" + export FTP_PROXY=$ftp_proxy + export ftp_proxy=$ftp_proxy +fi + +if test "x$no_proxy" != "x"; then + echo "setting no_proxy: $no_proxy" + export NO_PROXY=$no_proxy + export no_proxy=$no_proxy +fi + + +# fetch_metadata.sh +############ +# This section calls omnitruck to get the information about the build to be +# installed. 
+# +# Inputs: +# $channel: +# $project: +# $version: +# $platform: +# $platform_version: +# $machine: +# $tmp_dir: +# +# Outputs: +# $download_url: +# $sha256: +############ + +if test "x$download_url_override" = "x"; then + echo "Getting information for $project $channel $version for $platform..." + + metadata_filename="$tmp_dir/metadata.txt" + metadata_url="https://www.chef.io/$channel/$project/metadata?v=$version&p=$platform&pv=$platform_version&m=$machine" + + do_download "$metadata_url" "$metadata_filename" + + cat "$metadata_filename" + + echo "" + # check that all the mandatory fields in the downloaded metadata are there + if grep '^url' $metadata_filename > /dev/null && grep '^sha256' $metadata_filename > /dev/null; then + echo "downloaded metadata file looks valid..." + else + echo "downloaded metadata file is corrupted or an uncaught error was encountered in downloading the file..." + # this generally means one of the download methods downloaded a 404 or something like that and then reported a successful exit code, + # and this should be fixed in the function that was doing the download. + report_bug + exit 1 + fi + + download_url=`awk '$1 == "url" { print $2 }' "$metadata_filename"` + sha256=`awk '$1 == "sha256" { print $2 }' "$metadata_filename"` +else + download_url=$download_url_override + # Set sha256 to empty string if checksum not set + sha256=${checksum=""} +fi + +############ +# end of fetch_metadata.sh +############ + + +# fetch_package.sh +############ +# This section fetchs a package from $download_url and verifies its metadata. +# +# Inputs: +# $download_url: +# $tmp_dir: +# Optional Inputs: +# $cmdline_filename: Name of the package downloaded on local disk. +# $cmdline_dl_dir: Name of the directory downloaded package will be saved to on local disk. +# +# Outputs: +# $download_filename: Name of the downloaded file on local disk. +# $filetype: Type of the file downloaded. 
+############ + +filename=`echo $download_url | sed -e 's/^.*\///'` +filetype=`echo $filename | sed -e 's/^.*\.//'` + +# use either $tmp_dir, the provided directory (-d) or the provided filename (-f) +if test "x$cmdline_filename" != "x"; then + download_filename="$cmdline_filename" +elif test "x$cmdline_dl_dir" != "x"; then + download_filename="$cmdline_dl_dir/$filename" +else + download_filename="$tmp_dir/$filename" +fi + +# ensure the parent directory where to download the installer always exists +download_dir=`dirname $download_filename` +(umask 077 && mkdir -p $download_dir) || exit 1 + +# check if we have that file locally available and if so verify the checksum +# Use cases +# 1) metadata - new download +# 2) metadata - cached download when cmdline_dl_dir set +# 3) url override - no checksum new download +# 4) url override - with checksum new download +# 5) url override - with checksum cached download when cmdline_dl_dir set + +cached_file_available="false" +verify_checksum="true" + +if test -f $download_filename; then + echo "$download_filename exists" + cached_file_available="true" +fi + +if test "x$download_url_override" != "x"; then + echo "Download URL override specified" + if test "x$cached_file_available" = "xtrue"; then + echo "Verifying local file" + if test "x$sha256" = "x"; then + echo "Checksum not specified, ignoring existing file" + cached_file_available="false" # download new file + verify_checksum="false" # no checksum to compare after download + elif do_checksum "$download_filename" "$sha256"; then + echo "Checksum match, using existing file" + cached_file_available="true" # don't need to download file + verify_checksum="false" # don't need to checksum again + else + echo "Checksum mismatch, ignoring existing file" + cached_file_available="false" # download new file + verify_checksum="true" # checksum new downloaded file + fi + else + echo "$download_filename not found" + cached_file_available="false" # download new file + if test "x$sha256" 
= "x"; then + verify_checksum="false" # no checksum to compare after download + else + verify_checksum="true" # checksum new downloaded file + fi + fi +fi + +if test "x$cached_file_available" != "xtrue"; then + do_download "$download_url" "$download_filename" +fi + +if test "x$verify_checksum" = "xtrue"; then + do_checksum "$download_filename" "$sha256" || checksum_mismatch +fi + +############ +# end of fetch_package.sh +############ + + +# install_package.sh +############ +# Installs a package and removed the temp directory. +# +# Inputs: +# $download_filename: Name of the file to be installed. +# $filetype: Type of the file to be installed. +# $version: The version requested. Used only for warning user if not set. +############ + +if test "x$version" = "x" -a "x$CI" != "xtrue"; then + echo + echo "WARNING WARNING WARNING WARNING WARNING WARNING WARNING WARNING WARNING" + echo + echo "You are installing an omnibus package without a version pin. If you are installing" + echo "on production servers via an automated process this is DANGEROUS and you will" + echo "be upgraded without warning on new releases, even to new major releases." + echo "Letting the version float is only appropriate in desktop, test, development or" + echo "CI/CD environments." + echo + echo "WARNING WARNING WARNING WARNING WARNING WARNING WARNING WARNING WARNING" + echo +fi + +install_file $filetype "$download_filename" + +if test "x$tmp_dir" != "x"; then + rm -r "$tmp_dir" +fi + +############ +# end of install_package.sh +############