# Sums the MinCount of every compute resource in every Slurm queue declared in
# the cluster configuration. The caller treats the result as the cluster's
# static node count.
#
# The configuration is the YAML file located at
# node['cluster']['cluster_config_path'], from which only the
# Scheduling -> SlurmQueues -> ComputeResources -> MinCount path is read.
#
# @return [Integer] total MinCount across all queues and compute resources;
#   0 when the file is empty, has no Scheduling/SlurmQueues section, or the
#   queues declare no compute resources. A missing MinCount counts as 0.
def get_static_node_count
  # yaml is required locally, keeping this helper self-contained.
  require 'yaml'

  # safe_load returns a falsy value for an empty document; fall back to an
  # empty hash so the dig below cannot fail on nil/false.
  cluster_config = YAML.safe_load(File.read(node['cluster']['cluster_config_path'])) || {}
  slurm_queues = cluster_config.dig('Scheduling', 'SlurmQueues')

  # Array() turns a missing (nil) section into an empty list, so an absent
  # SlurmQueues or ComputeResources key contributes 0 instead of raising.
  Array(slurm_queues).sum do |queue_config|
    Array(queue_config['ComputeResources']).sum do |compute_resource_config|
      compute_resource_config['MinCount'].to_i
    end
  end
end