diff --git a/NEWS b/NEWS index b6b2eb9035671f582d690e55c6cdacfe1473a8a4..52a88ce15f6b13a19fd55e04ac6f2e3b89f83264 100644 --- a/NEWS +++ b/NEWS @@ -113,6 +113,12 @@ documents those changes that are of interest to users and administrators. -- Note when a job finishes in the slurmd to avoid a race when launching a batch job takes longer than it takes to finish. -- Improve slurmd startup on large systems (> 10000 nodes) + -- Add LaunchParameters option of cray_net_exclusive to control whether all + jobs on the cluster have exclusive access to their assigned nodes. + -- Make sure srun inside an allocation gets --ntasks-per-[core|socket] + set correctly. + -- Only make the extern step at job creation. + -- Fix for job step task layout with --cpus-per-task option. * Changes in Slurm 17.02.4 ========================== diff --git a/doc/man/man5/slurm.conf.5 b/doc/man/man5/slurm.conf.5 index e14fdd822fb17e9fc3d46738bc478850f6084bcf..d22aaa8a24420c562134f37d322619baa8f461c7 100644 --- a/doc/man/man5/slurm.conf.5 +++ b/doc/man/man5/slurm.conf.5 @@ -1272,6 +1272,12 @@ Identifies options to the job launch plugin. Acceptable values include: .RS .TP 24 +\fBcray_net_exclusive\fR +Allow jobs on a Cray Native cluster exclusive access to network resources. +This should only be set on clusters where each node is allocated to a +single job at a time and parallel steps are not used within the job; +otherwise, resources on the node can be oversubscribed. +.TP 24 \fBmem_sort\fR Sort NUMA memory at step start. User can override this default with SLURM_MEM_BIND environment variable or \-\-mem_bind=nosort command line option. 
diff --git a/src/plugins/switch/cray/switch_cray.c b/src/plugins/switch/cray/switch_cray.c index 5a632e63850fed988e14504aebf3eba18676f3d7..63e30fd0549e88c31498b1d5e06b5fee3144cdf6 100644 --- a/src/plugins/switch/cray/switch_cray.c +++ b/src/plugins/switch/cray/switch_cray.c @@ -417,7 +417,8 @@ extern int switch_p_job_init(stepd_step_rec_t *job) #if defined(HAVE_NATIVE_CRAY) || defined(HAVE_CRAY_NETWORK) slurm_cray_jobinfo_t *sw_job = (slurm_cray_jobinfo_t *) job->switch_job; int rc, num_ptags; - int mem_scaling, cpu_scaling; + char *launch_params; + int exclusive = 0, mem_scaling = 100, cpu_scaling = 100; int *ptags = NULL; char *err_msg = NULL; uint64_t cont_id = job->cont_id; @@ -485,29 +486,46 @@ extern int switch_p_job_init(stepd_step_rec_t *job) /* * Configure the network * - * I'm setting exclusive flag to zero for now until we can figure out a - * way to guarantee that the application not only has exclusive access - * to the node but also will not be suspended. This may not happen. - * * Cray shmem still uses the network, even when it's using only one * node, so we must always configure the network. */ - cpu_scaling = get_cpu_scaling(job); - if (cpu_scaling == -1) { - return SLURM_ERROR; + launch_params = slurm_get_launch_params(); + if (launch_params && strstr(launch_params, "cray_net_exclusive")) { + /* + * Grant exclusive access and all aries resources to the job. + * Not recommended if you may run multiple steps within + * the job, and will cause problems if you suspend or allow + * nodes to be shared across multiple jobs. + */ + /* + * TODO: determine if this can be managed per-job, rather + * than globally across the cluster. + */ + exclusive = 1; } + xfree(launch_params); - mem_scaling = get_mem_scaling(job); - if (mem_scaling == -1) { - return SLURM_ERROR; + if (!exclusive) { + /* + * Calculate percentages of cpu and mem to assign to + * non-exclusive jobs. 
+ */ + + cpu_scaling = get_cpu_scaling(job); + if (cpu_scaling == -1) + return SLURM_ERROR; + + mem_scaling = get_mem_scaling(job); + if (mem_scaling == -1) + return SLURM_ERROR; } if (debug_flags & DEBUG_FLAG_SWITCH) { - CRAY_INFO("Network Scaling: CPU %d Memory %d", - cpu_scaling, mem_scaling); + CRAY_INFO("Network Scaling: Exclusive %d CPU %d Memory %d", + exclusive, cpu_scaling, mem_scaling); } - rc = alpsc_configure_nic(&err_msg, 0, cpu_scaling, mem_scaling, + rc = alpsc_configure_nic(&err_msg, exclusive, cpu_scaling, mem_scaling, cont_id, sw_job->num_cookies, (const char **) sw_job->cookies, &num_ptags, &ptags, NULL); diff --git a/src/slurmctld/proc_req.c b/src/slurmctld/proc_req.c index ea6c96c61932d32bdee2aad1983effa5f2da772d..4f3bccc33dc90a26b022f90ca3bcfc255fcc08de 100644 --- a/src/slurmctld/proc_req.c +++ b/src/slurmctld/proc_req.c @@ -2735,15 +2735,38 @@ static void _slurm_rpc_job_alloc_info(slurm_msg_t * msg) job_info_resp_msg.resv_name = xstrdup(job_ptr->resv_name); job_info_resp_msg.select_jobinfo = select_g_select_jobinfo_copy(job_ptr->select_jobinfo); - if (job_ptr->details->env_cnt) { - job_info_resp_msg.env_size = job_ptr->details->env_cnt; - job_info_resp_msg.environment = - xmalloc(sizeof(char *) * - job_info_resp_msg.env_size); - for (i = 0; i < job_info_resp_msg.env_size; i++) { - job_info_resp_msg.environment[i] = - xstrdup(job_ptr->details->env_sup[i]); + if (job_ptr->details) { + job_info_resp_msg.pn_min_memory = + job_ptr->details->pn_min_memory; + + if (job_ptr->details->mc_ptr) { + job_info_resp_msg.ntasks_per_board = + job_ptr->details->mc_ptr-> + ntasks_per_board; + job_info_resp_msg.ntasks_per_core = + job_ptr->details->mc_ptr-> + ntasks_per_core; + job_info_resp_msg.ntasks_per_socket = + job_ptr->details->mc_ptr-> + ntasks_per_socket; + } + + if (job_ptr->details->env_cnt) { + job_info_resp_msg.env_size = + job_ptr->details->env_cnt; + job_info_resp_msg.environment = + xmalloc(sizeof(char *) * + job_info_resp_msg.env_size); + for (i 
= 0; i < job_info_resp_msg.env_size; i++) + job_info_resp_msg.environment[i] = + xstrdup(job_ptr->details-> + env_sup[i]); } + } else { + job_info_resp_msg.pn_min_memory = 0; + job_info_resp_msg.ntasks_per_board = (uint16_t)NO_VAL; + job_info_resp_msg.ntasks_per_core = (uint16_t)NO_VAL; + job_info_resp_msg.ntasks_per_socket = (uint16_t)NO_VAL; } unlock_slurmctld(job_read_lock); diff --git a/src/slurmctld/step_mgr.c b/src/slurmctld/step_mgr.c index 8d9ef02a0fb078016a58660af1eb460db00b1fc7..a909344c78da9f755af10aade4908df81b96a302 100644 --- a/src/slurmctld/step_mgr.c +++ b/src/slurmctld/step_mgr.c @@ -2172,6 +2172,7 @@ static int _calc_cpus_per_task(job_step_create_request_msg_t *step_specs, struct job_record *job_ptr) { int cpus_per_task = 0, i; + int num_tasks; if ((step_specs->cpu_count == 0) || (step_specs->cpu_count % step_specs->num_tasks)) @@ -2184,14 +2185,20 @@ static int _calc_cpus_per_task(job_step_create_request_msg_t *step_specs, if (!job_ptr->job_resrcs) return cpus_per_task; + num_tasks = step_specs->num_tasks; for (i = 0; i < job_ptr->job_resrcs->cpu_array_cnt; i++) { - if ((cpus_per_task > job_ptr->job_resrcs->cpu_array_value[i]) || - (job_ptr->job_resrcs->cpu_array_value[i] % cpus_per_task)) { + if (cpus_per_task > job_ptr->job_resrcs->cpu_array_value[i]) { cpus_per_task = 0; break; } + num_tasks -= (job_ptr->job_resrcs->cpu_array_value[i] / + cpus_per_task) * + job_ptr->job_resrcs->cpu_array_reps[i]; } + if (num_tasks > 0) + return 0; + return cpus_per_task; } @@ -3434,15 +3441,6 @@ extern int step_partial_comp(step_complete_msg_t *req, uid_t uid, step_ptr = find_step_record(job_ptr, req->job_step_id); - /* FIXME: It was changed in 16.05.3 to make the extern step - * at the beginning of the job, so this isn't really needed - * anymore, but just in case there were steps out on the nodes - * during an upgrade this was left in. It can probably be - * taken out in future releases though. 
- */ - if ((step_ptr == NULL) && (req->job_step_id == SLURM_EXTERN_CONT)) - step_ptr = build_extern_step(job_ptr); - if (step_ptr == NULL) { info("step_partial_comp: StepID=%u.%u invalid", req->job_id, req->job_step_id); diff --git a/src/srun/libsrun/srun_job.c b/src/srun/libsrun/srun_job.c index 66b46c9899d4717070fa57d95f7bf761074d9d06..5fc6d90559ee654f9744ac15d3a3b603c3f4a08f 100644 --- a/src/srun/libsrun/srun_job.c +++ b/src/srun/libsrun/srun_job.c @@ -376,8 +376,12 @@ job_step_create_allocation(resource_allocation_response_msg_t *resp) ai->cpus_per_node = resp->cpus_per_node; ai->cpu_count_reps = resp->cpu_count_reps; ai->ntasks_per_board = resp->ntasks_per_board; - ai->ntasks_per_core = resp->ntasks_per_core; - ai->ntasks_per_socket = resp->ntasks_per_socket; + + /* Here let the srun options override the allocation resp */ + ai->ntasks_per_core = (opt.ntasks_per_core != NO_VAL) ? + opt.ntasks_per_core : resp->ntasks_per_core; + ai->ntasks_per_socket = (opt.ntasks_per_socket != NO_VAL) ? + opt.ntasks_per_socket : resp->ntasks_per_socket; ai->partition = resp->partition;