diff --git a/doc/man/man5/slurm.conf.5 b/doc/man/man5/slurm.conf.5
index 55a805e4ce4470e86b37ffb15151f93f46b9cf25..0e0aea84b284dd724c1a99903718e3af35b7e48d 100644
--- a/doc/man/man5/slurm.conf.5
+++ b/doc/man/man5/slurm.conf.5
@@ -1159,6 +1159,15 @@ Maximum number of tasks SLURM will allow a job step to spawn
 on a single node. The default \fBMaxTasksPerNode\fR is 128.
 May not exceed 65533.
 
+.TP
+\fBMemLimitEnforce\fR
+If set to "no", then Slurm will not terminate the job or job step if
+it exceeds the amount of memory requested with the \-\-mem\-per\-cpu
+option of salloc/sbatch/srun. This is useful if jobs need to specify
+\-\-mem\-per\-cpu for scheduling, but should not be terminated if they
+exceed the estimated value. The default value is "yes": terminate the
+job/step if it exceeds the requested memory.
+
 .TP
 \fBMessageTimeout\fR
 Time permitted for a round\-trip communication to complete
diff --git a/slurm/slurm.h.in b/slurm/slurm.h.in
index 333767459d84cca1b0904a52e735cac99111f1f7..b161664d50c2922d3dd4d944e8dfa7003805455e 100644
--- a/slurm/slurm.h.in
+++ b/slurm/slurm.h.in
@@ -2132,6 +2132,7 @@ typedef struct slurm_ctl_conf {
 	uint32_t max_mem_per_cpu; /* maximum MB memory per allocated CPU */
 	uint32_t max_step_cnt;	/* maximum number of steps per job */
 	uint16_t max_tasks_per_node; /* maximum tasks per node */
+	bool mem_limit_enforce;	/* Enforce mem limit at runtime y|n */
 	uint16_t min_job_age;	/* COMPLETED jobs over this age (secs)
 				 * purged from in memory records */
 	char *mpi_default;	/* Default version of MPI in use */
diff --git a/src/common/read_config.c b/src/common/read_config.c
index 48ce72875d5ed2bf732d5a97f7cafdbd5220779e..223dad76291dda5e5b3d3737292dd2fe63b0f1e1 100644
--- a/src/common/read_config.c
+++ b/src/common/read_config.c
@@ -241,6 +241,7 @@ s_p_options_t slurm_conf_options[] = {
 	{"MaxMemPerNode", S_P_UINT32},
 	{"MaxStepCount", S_P_UINT32},
 	{"MaxTasksPerNode", S_P_UINT16},
+	{"MemLimitEnforce", S_P_STRING},
 	{"MessageTimeout", S_P_UINT16},
 	{"MinJobAge", S_P_UINT16},
 	{"MpiDefault", S_P_STRING},
@@ -2280,6 +2281,7 @@ init_slurm_conf (slurm_ctl_conf_t *ctl_conf_ptr)
 	ctl_conf_ptr->max_job_id = NO_VAL;
 	ctl_conf_ptr->max_mem_per_cpu = 0;
 	ctl_conf_ptr->max_step_cnt = (uint32_t) NO_VAL;
+	ctl_conf_ptr->mem_limit_enforce = true;
 	ctl_conf_ptr->min_job_age = (uint16_t) NO_VAL;
 	xfree (ctl_conf_ptr->mpi_default);
 	xfree (ctl_conf_ptr->mpi_params);
@@ -3940,6 +3942,15 @@ _validate_and_set_defaults(slurm_ctl_conf_t *conf, s_p_hashtbl_t *hashtbl)
 		return SLURM_ERROR;
 	}
 #endif
+	/* The default value is true, meaning the memory
+	 * limit is enforced at runtime by slurmstepd
+	 * and/or slurmd.
+	 */
+	if (s_p_get_string(&temp_str, "MemLimitEnforce", hashtbl)) {
+		if (strncasecmp(temp_str, "no", 2) == 0)
+			conf->mem_limit_enforce = false;
+		xfree(temp_str);
+	}
 
 	xfree(default_storage_type);
 	xfree(default_storage_loc);
diff --git a/src/slurmd/common/slurmstepd_init.c b/src/slurmd/common/slurmstepd_init.c
index bc7977eab06bcea75475b7ad7d9b585ae299957c..3b15e1a60d6804358d75495e1e764daafabb2c76 100644
--- a/src/slurmd/common/slurmstepd_init.c
+++ b/src/slurmd/common/slurmstepd_init.c
@@ -72,6 +72,7 @@ extern void pack_slurmd_conf_lite(slurmd_conf_t *conf, Buf buffer)
 	packstr(conf->node_topo_pattern, buffer);
 	pack32((uint32_t)conf->port, buffer);
 	pack16(conf->log_fmt, buffer);
+	pack16(conf->mem_limit_enforce, buffer);
 }
 
 extern int unpack_slurmd_conf_lite_no_alloc(slurmd_conf_t *conf, Buf buffer)
@@ -111,6 +112,7 @@ extern int unpack_slurmd_conf_lite_no_alloc(slurmd_conf_t *conf, Buf buffer)
 	safe_unpackstr_xmalloc(&conf->node_topo_pattern, &uint32_tmp, buffer);
 	safe_unpack32(&uint32_tmp, buffer);
 	safe_unpack16(&conf->log_fmt, buffer);
+	safe_unpack16(&conf->mem_limit_enforce, buffer);
 	conf->port = uint32_tmp;
 
 	return SLURM_SUCCESS;
diff --git a/src/slurmd/slurmd/req.c b/src/slurmd/slurmd/req.c
index e2273b6cc5b2277fb7b1d89829c8e1f5fe370e92..e63b796f2c40ed85bb13582c24bf1d984c9eb512 100644
--- a/src/slurmd/slurmd/req.c
+++ b/src/slurmd/slurmd/req.c
@@ -1985,6 +1985,12 @@ _enforce_job_mem_limit(void)
 	};
 	struct job_mem_info *job_mem_info_ptr = NULL;
 
+	/* If MemLimitEnforce=no is configured in slurm.conf,
+	 * skip memory limit enforcement.
+	 */
+	if (conf->mem_limit_enforce == false)
+		return;
+
 	slurm_mutex_lock(&job_limits_mutex);
 	if (!job_limits_loaded)
 		_load_job_limits();
@@ -5234,7 +5240,7 @@ _rpc_forward_data(slurm_msg_t *msg)
 		rc = EINVAL;
 		goto done;
 	}
-	
+
 	/* connect to specified address */
 	fd = socket(AF_UNIX, SOCK_STREAM, 0);
 	if (fd < 0) {
@@ -5262,7 +5268,7 @@
 	req_uid = htonl(req->len);
 	safe_write(fd, &req_uid, sizeof(uint32_t));
 	safe_write(fd, req->data, req->len);
-	
+
 rwfail:
 done:
 	if (fd >= 0)
diff --git a/src/slurmd/slurmd/slurmd.c b/src/slurmd/slurmd/slurmd.c
index c06539b8bd9c5c9b44e58b102af653d47c22905d..4915e8c35ee52663d6e7af7bc2e7d4eedcb50cfc 100644
--- a/src/slurmd/slurmd/slurmd.c
+++ b/src/slurmd/slurmd/slurmd.c
@@ -943,6 +943,8 @@ _read_config(void)
 	conf->use_pam = cf->use_pam;
 	conf->task_plugin_param = cf->task_plugin_param;
 
+	conf->mem_limit_enforce = cf->mem_limit_enforce;
+
 	slurm_mutex_unlock(&conf->config_mutex);
 	slurm_conf_unlock();
 }
diff --git a/src/slurmd/slurmd/slurmd.h b/src/slurmd/slurmd/slurmd.h
index 4c3ef6964d5d2198712b570978be4eff1b8bc01a..a9ad06c9673bcadd4e81e29308876404b4bd66a7 100644
--- a/src/slurmd/slurmd/slurmd.h
+++ b/src/slurmd/slurmd/slurmd.h
@@ -101,6 +101,7 @@ typedef struct slurmd_config {
 	uint16_t	cr_type;	/* Consumable Resource Type:      *
 					 * CR_SOCKET, CR_CORE, CR_MEMORY, *
 					 * CR_DEFAULT, etc.               */
+	uint16_t	mem_limit_enforce; /* enforce mem limit on running job */
 	int		nice;		/* command line nice value spec   */
 	char		*node_name;	/* node name                      */
 	char		*node_addr;	/* node's address                 */
diff --git a/src/slurmd/slurmstepd/slurmstepd_job.c b/src/slurmd/slurmstepd/slurmstepd_job.c
index cc2ebd0c341a4e21d7a4cde5427509af13c1f7e3..e9750602d06bf3e0b58574be5627ca30e2c1a573 100644
--- a/src/slurmd/slurmstepd/slurmstepd_job.c
+++ b/src/slurmd/slurmstepd/slurmstepd_job.c
@@ -447,10 +447,16 @@ stepd_step_rec_create(launch_tasks_request_msg_t *msg)
 	format_core_allocs(msg->cred, conf->node_name, conf->cpus,
 			   &job->job_alloc_cores, &job->step_alloc_cores,
 			   &job->job_mem, &job->step_mem);
-	if (job->step_mem) {
+
+	/* If MemLimitEnforce=no is configured in slurm.conf,
+	 * do not set a memory limit to be enforced at runtime.
+	 */
+	if (job->step_mem
+	    && conf->mem_limit_enforce) {
 		jobacct_gather_set_mem_limit(job->jobid, job->stepid,
 					     job->step_mem);
-	} else if (job->job_mem) {
+	} else if (job->job_mem
+		   && conf->mem_limit_enforce) {
 		jobacct_gather_set_mem_limit(job->jobid, job->stepid,
 					     job->job_mem);
 	}
@@ -553,9 +559,11 @@ batch_stepd_step_rec_create(batch_job_launch_msg_t *msg)
 	format_core_allocs(msg->cred, conf->node_name, conf->cpus,
 			   &job->job_alloc_cores, &job->step_alloc_cores,
 			   &job->job_mem, &job->step_mem);
-	if (job->step_mem)
+	if (job->step_mem
+	    && conf->mem_limit_enforce)
 		jobacct_gather_set_mem_limit(job->jobid, NO_VAL, job->step_mem);
-	else if (job->job_mem)
+	else if (job->job_mem
+		 && conf->mem_limit_enforce)
 		jobacct_gather_set_mem_limit(job->jobid, NO_VAL, job->job_mem);
 
 	get_cred_gres(msg->cred, conf->node_name,
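
For reference, a minimal usage sketch of the parameter documented above, assuming the option names shown in the patch (the application name is hypothetical):

    # slurm.conf: keep memory requests for scheduling, but do not
    # terminate jobs/steps that exceed them at runtime
    MemLimitEnforce=no

    # The per-CPU memory request still informs scheduling decisions
    srun --mem-per-cpu=2048 ./my_app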