diff --git a/src/common/slurm_cred.c b/src/common/slurm_cred.c
index cb8879ee1b2405924072aa7c16e97170bb442c0a..e7f7e8ea1d6266dff214b4cf518f111f7ea97fb2 100644
--- a/src/common/slurm_cred.c
+++ b/src/common/slurm_cred.c
@@ -721,6 +721,40 @@ slurm_cred_faker(slurm_cred_arg_t *arg)
 }
 
 
+void slurm_cred_free_args(slurm_cred_arg_t *arg)
+{
+	xfree(arg->hostlist);
+	xfree(arg->alloc_lps);
+	arg->alloc_lps_cnt = 0;
+}
+
+int
+slurm_cred_get_args(slurm_cred_t cred, slurm_cred_arg_t *arg)
+{
+	xassert(cred != NULL);
+	xassert(arg != NULL);
+
+	/*
+	 * set arguments to cred contents
+	 */
+	slurm_mutex_lock(&cred->mutex);
+	arg->jobid = cred->jobid;
+	arg->stepid = cred->stepid;
+	arg->uid = cred->uid;
+	arg->job_mem = cred->job_mem;
+	arg->task_mem = cred->task_mem;
+	arg->hostlist = xstrdup(cred->nodes);
+	arg->alloc_lps_cnt = cred->alloc_lps_cnt;
+	if (arg->alloc_lps_cnt > 0) {
+		arg->alloc_lps = xmalloc(arg->alloc_lps_cnt * sizeof(uint32_t));
+		memcpy(arg->alloc_lps, cred->alloc_lps,
+		       arg->alloc_lps_cnt * sizeof(uint32_t));
+	} else
+		arg->alloc_lps = NULL;
+	slurm_mutex_unlock(&cred->mutex);
+
+	return SLURM_SUCCESS;
+}
 
 int
 slurm_cred_verify(slurm_cred_ctx_t ctx, slurm_cred_t cred,
@@ -775,13 +809,13 @@ slurm_cred_verify(slurm_cred_ctx_t ctx, slurm_cred_t cred,
 	arg->job_mem = cred->job_mem;
 	arg->task_mem = cred->task_mem;
 	arg->hostlist = xstrdup(cred->nodes);
-	arg->alloc_lps_cnt = cred->alloc_lps_cnt;
-	arg->alloc_lps = NULL;
-	if (arg->alloc_lps_cnt > 0) {
-		arg->alloc_lps = xmalloc(arg->alloc_lps_cnt * sizeof(uint32_t));
-		memcpy(arg->alloc_lps, cred->alloc_lps,
-		       arg->alloc_lps_cnt * sizeof(uint32_t));
-	}
+	arg->alloc_lps_cnt = cred->alloc_lps_cnt;
+	if (arg->alloc_lps_cnt > 0) {
+		arg->alloc_lps = xmalloc(arg->alloc_lps_cnt * sizeof(uint32_t));
+		memcpy(arg->alloc_lps, cred->alloc_lps,
+		       arg->alloc_lps_cnt * sizeof(uint32_t));
+	} else
+		arg->alloc_lps = NULL;
 
 	slurm_mutex_unlock(&cred->mutex);
 
diff --git a/src/common/slurm_cred.h b/src/common/slurm_cred.h
index cfe67fd61c80d815c4caa5d3c3b197f46cec3c62..7fe6b226b7143965315b8c402544f25cbfeaab65 100644
--- a/src/common/slurm_cred.h
+++ b/src/common/slurm_cred.h
@@ -165,6 +165,13 @@ slurm_cred_t slurm_cred_copy(slurm_cred_t cred);
  */
 slurm_cred_t slurm_cred_faker(slurm_cred_arg_t *arg);
 
+/* Free the credential arguments as loaded by either
+ * slurm_cred_get_args() or slurm_cred_verify() */
+void slurm_cred_free_args(slurm_cred_arg_t *arg);
+
+/* Make a copy of the credential's arguments */
+int slurm_cred_get_args(slurm_cred_t cred, slurm_cred_arg_t *arg);
+
 /*
  * Verify the signed credential `cred,' and return cred contents in
  * the cred_arg structure. The credential is cached and cannot be reused.
diff --git a/src/slurmctld/job_scheduler.c b/src/slurmctld/job_scheduler.c
index 457827d24aa0f7826546097f1d417ae0f109c304..6f0936a9cada0efea58085d2eb62d6526b72a73c 100644
--- a/src/slurmctld/job_scheduler.c
+++ b/src/slurmctld/job_scheduler.c
@@ -458,7 +458,7 @@ extern void launch_job(struct job_record *job_ptr)
 	launch_msg_ptr->open_mode = job_ptr->details->open_mode;
 	launch_msg_ptr->acctg_freq = job_ptr->details->acctg_freq;
 
-	if (make_batch_job_cred(launch_msg_ptr)) {
+	if (make_batch_job_cred(launch_msg_ptr, job_ptr)) {
 		error("aborting batch job %u", job_ptr->job_id);
 		/* FIXME: This is a kludge, but this event indicates a serious
 		 * problem with OpenSSH and should never happen. We are
@@ -524,9 +524,11 @@ _xduparray(uint16_t size, char ** array)
  * make_batch_job_cred - add a job credential to the batch_job_launch_msg
  * IN/OUT launch_msg_ptr - batch_job_launch_msg in which job_id, step_id,
  *	uid and nodes have already been set
+ * IN job_ptr - pointer to job record
  * RET 0 or error code
  */
-extern int make_batch_job_cred(batch_job_launch_msg_t *launch_msg_ptr)
+extern int make_batch_job_cred(batch_job_launch_msg_t *launch_msg_ptr,
+			       struct job_record *job_ptr)
 {
 	slurm_cred_arg_t cred_arg;
 
@@ -534,6 +536,15 @@ extern int make_batch_job_cred(batch_job_launch_msg_t *launch_msg_ptr)
 	cred_arg.stepid = launch_msg_ptr->step_id;
 	cred_arg.uid = launch_msg_ptr->uid;
 	cred_arg.hostlist = launch_msg_ptr->nodes;
+	if (job_ptr->details == NULL)
+		cred_arg.job_mem = 0;
+	else if (job_ptr->details->job_min_memory & MEM_PER_CPU) {
+		cred_arg.job_mem = job_ptr->details->job_min_memory;
+		cred_arg.job_mem &= (~MEM_PER_CPU);
+		cred_arg.job_mem *= job_ptr->alloc_lps[0];
+	} else
+		cred_arg.job_mem = job_ptr->details->job_min_memory;
+
 	cred_arg.alloc_lps_cnt = 0;
 	cred_arg.alloc_lps = NULL;
 
diff --git a/src/slurmctld/job_scheduler.h b/src/slurmctld/job_scheduler.h
index b40310137dcf06f569fa9b53d8370ce9c5b4e265..f09cb7e728393032185a04fc12a0a82320e0eb80 100644
--- a/src/slurmctld/job_scheduler.h
+++ b/src/slurmctld/job_scheduler.h
@@ -82,9 +82,11 @@ extern void launch_job(struct job_record *job_ptr);
  * make_batch_job_cred - add a job credential to the batch_job_launch_msg
  * IN/OUT launch_msg_ptr - batch_job_launch_msg in which job_id, step_id,
  *	uid and nodes have already been set
+ * IN job_ptr - pointer to job record
  * RET 0 or error code
  */
-extern int make_batch_job_cred(batch_job_launch_msg_t *launch_msg_ptr);
+extern int make_batch_job_cred(batch_job_launch_msg_t *launch_msg_ptr,
+			       struct job_record *job_ptr);
 
 /* Print a job's dependency information based upon job_ptr->depend_list */
 extern void print_job_dependency(struct job_record *job_ptr);
diff --git a/src/slurmctld/proc_req.c b/src/slurmctld/proc_req.c
index c9f85c362a528b17808fb23647dcb797df9ee0a5..623cb3b482f4d03651c1c3301ee0d6a0eef54dc7 100644
--- a/src/slurmctld/proc_req.c
+++ b/src/slurmctld/proc_req.c
@@ -503,22 +503,47 @@ static int _make_step_cred(struct step_record *step_rec,
 			   slurm_cred_t *slurm_cred)
 {
 	slurm_cred_arg_t cred_arg;
+	struct job_record* job_ptr = step_rec->job_ptr;
 
-	cred_arg.jobid = step_rec->job_ptr->job_id;
+	cred_arg.jobid = job_ptr->job_id;
 	cred_arg.stepid = step_rec->step_id;
-	cred_arg.uid = step_rec->job_ptr->user_id;
-	cred_arg.job_mem = step_rec->job_ptr->details->job_min_memory;
+	cred_arg.uid = job_ptr->user_id;
+	cred_arg.job_mem = job_ptr->details->job_min_memory;
 	cred_arg.task_mem = step_rec->mem_per_task;
 	cred_arg.hostlist = step_rec->step_layout->node_list;
-	if(step_rec->job_ptr->details->shared == 0)
-		cred_arg.alloc_lps_cnt = 0;
-	else
-		cred_arg.alloc_lps_cnt = step_rec->job_ptr->alloc_lps_cnt;
-	if (cred_arg.alloc_lps_cnt > 0) {
-		cred_arg.alloc_lps = xmalloc(cred_arg.alloc_lps_cnt *
+
+	if (job_ptr->details->shared == 0)
+		cred_arg.alloc_lps_cnt = 0;
+	else
+		cred_arg.alloc_lps_cnt = job_ptr->alloc_lps_cnt;
+
+	if ((cred_arg.alloc_lps_cnt > 0) &&
+	    bit_equal(job_ptr->node_bitmap, step_rec->step_node_bitmap)) {
+		cred_arg.alloc_lps = xmalloc(cred_arg.alloc_lps_cnt *
+					     sizeof(uint32_t));
+		memcpy(cred_arg.alloc_lps, step_rec->job_ptr->alloc_lps,
+		       cred_arg.alloc_lps_cnt*sizeof(uint32_t));
+	} else if (cred_arg.alloc_lps_cnt > 0) {
+		/* Construct an array of allocated CPUs per node.
+		 * Translate from array based upon job's allocation
+		 * to array based upon nodes allocated to the step. */
+		int i, job_inx = -1, step_inx = -1;
+		int job_inx_target = job_ptr->node_cnt;
+		cred_arg.alloc_lps = xmalloc(cred_arg.alloc_lps_cnt *
 					     sizeof(uint32_t));
-		memcpy(cred_arg.alloc_lps, step_rec->job_ptr->alloc_lps,
-		       cred_arg.alloc_lps_cnt*sizeof(uint32_t));
+		for (i=0; i<node_record_count; i++) {
+			if (!bit_test(job_ptr->node_bitmap, i))
+				continue;
+			job_inx++;
+			if (!bit_test(step_rec->step_node_bitmap, i))
+				continue;
+			step_inx++;
+			cred_arg.alloc_lps[step_inx] =
+				job_ptr->alloc_lps[job_inx];
+			if (job_inx == job_inx_target)
+				break;
+		}
+		cred_arg.alloc_lps_cnt = step_inx + 1;
 	} else
 		cred_arg.alloc_lps = NULL;
 
@@ -2649,7 +2674,7 @@ int _launch_batch_step(job_desc_msg_t *job_desc_msg, uid_t uid,
 	launch_msg_ptr->uid = uid;
 	launch_msg_ptr->nodes = xstrdup(job_ptr->nodes);
 
-	if (make_batch_job_cred(launch_msg_ptr)) {
+	if (make_batch_job_cred(launch_msg_ptr, job_ptr)) {
 		error("aborting batch step %u.%u", job_ptr->job_id,
 		      job_ptr->group_id);
 		xfree(launch_msg_ptr->nodes);
diff --git a/src/slurmd/slurmd/req.c b/src/slurmd/slurmd/req.c
index a7cdb943379b9dedb6fc7e953824d5f1968efd29..35fbf7038bf2ef5019a43c6b68b8688a8f627220 100644
--- a/src/slurmd/slurmd/req.c
+++ b/src/slurmd/slurmd/req.c
@@ -612,7 +612,7 @@ _forkexec_slurmstepd(slurmd_step_type_t type, void *req,
  */
 static int
 _check_job_credential(launch_tasks_request_msg_t *req, uid_t uid,
-		      int tasks_to_launch, hostset_t *step_hset)
+		      int node_id, hostset_t *step_hset)
 {
 	slurm_cred_arg_t arg;
 	hostset_t hset = NULL;
@@ -623,7 +623,7 @@ _check_job_credential(launch_tasks_request_msg_t *req, uid_t uid,
 	slurm_cred_t cred = req->cred;
 	uint32_t jobid = req->job_id;
 	uint32_t stepid = req->job_step_id;
-
+	int tasks_to_launch = req->tasks_to_launch[node_id];
 	/*
 	 * First call slurm_cred_verify() so that all valid
 	 * credentials are checked
@@ -645,8 +645,7 @@ _check_job_credential(launch_tasks_request_msg_t *req, uid_t uid,
 		if (rc >= 0) {
 			if ((hset = hostset_create(arg.hostlist)))
 				*step_hset = hset;
-			xfree(arg.hostlist);
-			xfree(arg.alloc_lps);
+			slurm_cred_free_args(&arg);
 		}
 		return SLURM_SUCCESS;
 	}
@@ -681,8 +680,8 @@ _check_job_credential(launch_tasks_request_msg_t *req, uid_t uid,
 	if ((arg.alloc_lps_cnt > 0) && (tasks_to_launch > 0)) {
 		host_index = hostset_find(hset, conf->node_name);
 
-		/* Left in here for debugging purposes */
 #if(0)
+		/* Left for debugging purposes */
 		if (host_index >= 0)
 			info(" cons_res %u alloc_lps_cnt %u "
 			     "task[%d] = %u = task_to_launch %d host %s ",
@@ -692,11 +691,14 @@ _check_job_credential(launch_tasks_request_msg_t *req, uid_t uid,
 #endif
 
 		if (host_index < 0) {
-			error("job cr credential invalid host_index %d for job %u",
-			      host_index, arg.jobid);
+			error("job cr credential invalid host_index %d for "
+			      "job %u", host_index, arg.jobid);
 			goto fail;
 		}
-
+		if (host_index > arg.alloc_lps_cnt)
+			error("host_index > alloc_lps_cnt in credential");
+		else if (arg.alloc_lps[host_index] == 0)
+			error("cons_res: zero processors allocated to step");
 		if (tasks_to_launch > arg.alloc_lps[host_index]) {
 			error("cons_res: More than one tasks per logical "
 			      "processor (%d > %u) on host [%u.%u %ld %s] ",
@@ -708,33 +710,33 @@ _check_job_credential(launch_tasks_request_msg_t *req, uid_t uid,
 		}
 	}
 
-	/* Overwrite any memory limits in the RPC with
-	 * contents of the credential */
+	/* Overwrite any memory limits in the RPC with contents of the
+	 * memory limit within the credential.
+	 * Reset the CPU count on this node to correct value. */
 	if (arg.job_mem & MEM_PER_CPU) {
 		req->job_mem = arg.job_mem & (~MEM_PER_CPU);
-		if (host_index >= 0)
+		if ((host_index >= 0) && (host_index < arg.alloc_lps_cnt) &&
+		    (arg.alloc_lps[host_index] > 0))
 			req->job_mem *= arg.alloc_lps[host_index];
 	} else
 		req->job_mem = arg.job_mem;
 	req->task_mem = arg.task_mem;	/* Defunct */
+	if ((host_index >= 0) && (host_index < arg.alloc_lps_cnt))
+		req->cpus_allocated[node_id] = arg.alloc_lps[host_index];
 #if 0
 	info("mem orig:%u cpus:%u limit:%u",
 	     arg.job_mem, arg.alloc_lps[host_index], req->job_mem);
 #endif
 
 	*step_hset = hset;
-	xfree(arg.hostlist);
-	arg.alloc_lps_cnt = 0;
-	xfree(arg.alloc_lps);
+	slurm_cred_free_args(&arg);
 	return SLURM_SUCCESS;
 
 fail:
 	if (hset)
 		hostset_destroy(hset);
 	*step_hset = NULL;
-	xfree(arg.hostlist);
-	arg.alloc_lps_cnt = 0;
-	xfree(arg.alloc_lps);
+	slurm_cred_free_args(&arg);
 	slurm_seterrno_ret(ESLURMD_INVALID_JOB_CREDENTIAL);
 }
 
@@ -775,8 +777,7 @@ _rpc_launch_tasks(slurm_msg_t *msg)
 	     req->job_step_id, req->uid, req->gid, host, port);
 
 	first_job_run = !slurm_cred_jobid_cached(conf->vctx, req->job_id);
-	if (_check_job_credential(req, req_uid, req->tasks_to_launch[nodeid],
-				  &step_hset) < 0) {
+	if (_check_job_credential(req, req_uid, nodeid, &step_hset) < 0) {
 		errnum = errno;
 		error("Invalid job credential from %ld@%s: %m",
 		      (long) req_uid, host);
@@ -817,7 +818,9 @@ _rpc_launch_tasks(slurm_msg_t *msg)
 			job_limits_ptr->job_id = req->job_id;
 			list_append(job_limits_list, job_limits_ptr);
 		}
-		job_limits_ptr->job_mem = req->job_mem;	/* reset limit */
+		/* reset memory limit based upon value calculated in
+		 * _check_job_credential() above */
+		job_limits_ptr->job_mem = req->job_mem;
 		slurm_mutex_unlock(&job_limits_mutex);
 	}
 
@@ -928,6 +931,28 @@ _get_user_env(batch_job_launch_msg_t *req)
 	xfree(pwd_buf);
 }
 
+/* The RPC currently contains a memory size limit, but we load the
+ * value from the job credential to be certain it has not been
+ * altered by the user */
+static void
+_set_batch_job_limits(slurm_msg_t *msg)
+{
+	slurm_cred_arg_t arg;
+	batch_job_launch_msg_t *req = (batch_job_launch_msg_t *)msg->data;
+
+	if (slurm_cred_get_args(req->cred, &arg) != SLURM_SUCCESS)
+		return;
+
+	if (arg.job_mem & MEM_PER_CPU) {
+		req->job_mem = arg.job_mem & (~MEM_PER_CPU);
+		if (arg.alloc_lps_cnt > 1)
+			req->job_mem *= arg.alloc_lps_cnt;
+	} else
+		req->job_mem = arg.job_mem;
+
+	slurm_cred_free_args(&arg);
+}
+
 static void
 _rpc_batch_job(slurm_msg_t *msg)
 {
@@ -995,6 +1020,8 @@ _rpc_batch_job(slurm_msg_t *msg)
 			goto done;
 		}
 	}
+	_get_user_env(req);
+	_set_batch_job_limits(msg);
 
 	/* Since job could have been killed while the prolog was
 	 * running (especially on BlueGene, which can take minutes
@@ -1006,7 +1033,6 @@ _rpc_batch_job(slurm_msg_t *msg)
 		rc = ESLURMD_CREDENTIAL_REVOKED;	/* job already ran */
 		goto done;
 	}
-	_get_user_env(req);
 
 	slurm_mutex_lock(&launch_mutex);
 	if (req->step_id == SLURM_BATCH_SCRIPT)
diff --git a/src/slurmd/slurmstepd/slurmstepd_job.c b/src/slurmd/slurmstepd/slurmstepd_job.c
index 4cb2351cb3d8cfda8a1b949375702885f630ba18..45b1b7bb784dad6eeab8a2db8de38c7b419d97f2 100644
--- a/src/slurmd/slurmstepd/slurmstepd_job.c
+++ b/src/slurmd/slurmstepd/slurmstepd_job.c
@@ -325,6 +325,7 @@ job_batch_job_create(batch_job_launch_msg_t *msg)
 
 	job->state = SLURMSTEPD_STEP_STARTING;
 	job->pwd = pwd;
+	job->cpus = msg->cpus_per_node[0];
 	job->ntasks = 1;
 	job->nprocs = msg->nprocs;
 	job->jobid = msg->job_id;
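Note on the memory-limit handling above: make_batch_job_cred(), _check_job_credential() and _set_batch_job_limits() all test the MEM_PER_CPU flag bit in the credential's job_mem field and, when it is set, clear the flag and multiply the remaining per-CPU value by the CPU count for the node. The standalone sketch below shows that arithmetic in isolation; the flag value and the helper name node_mem_limit() are illustrative stand-ins for this example, not part of the SLURM API.

/* Sketch only: mirrors the per-CPU memory-limit arithmetic in this patch.
 * MEM_PER_CPU is assumed to be a single high flag bit as in slurm.h;
 * the exact value used here is illustrative, not authoritative. */
#include <stdint.h>
#include <stdio.h>

#define MEM_PER_CPU 0x80000000u		/* assumed flag bit */

/* Memory limit (MB) to enforce on one node, given the job's
 * job_min_memory field and the CPUs allocated to the job on that node. */
static uint32_t node_mem_limit(uint32_t job_min_memory, uint32_t alloc_cpus)
{
	if (job_min_memory & MEM_PER_CPU) {
		uint32_t per_cpu = job_min_memory & ~MEM_PER_CPU;
		return per_cpu * alloc_cpus;	/* per-CPU limit scaled by CPU count */
	}
	return job_min_memory;			/* already a per-node limit */
}

int main(void)
{
	/* 512 MB per CPU, 4 CPUs allocated on this node -> 2048 MB */
	printf("%u\n", (unsigned) node_mem_limit(512 | MEM_PER_CPU, 4));
	/* flat 1024 MB per node, CPU count ignored */
	printf("%u\n", (unsigned) node_mem_limit(1024, 4));
	return 0;
}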
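The second branch added to _make_step_cred() compresses the job's per-node CPU array down to only the nodes the step actually uses, by walking the cluster's node indices and advancing separate job and step counters. A minimal standalone sketch of that translation follows; it replaces SLURM's bitstr_t bitmaps with plain int arrays, and translate_alloc_lps() is a hypothetical name used only for this example.

/* Sketch only: job-to-step CPU-count translation under simplified types.
 * job_nodes[i]/step_nodes[i] are 1 if cluster node i belongs to the job/step.
 * job_alloc_lps[] holds CPUs per node in the job's node ordering;
 * step_alloc_lps[] receives CPUs per node in the step's node ordering.
 * Returns the number of entries written to step_alloc_lps[]. */
#include <stdio.h>

static int translate_alloc_lps(int node_cnt, const int *job_nodes,
			       const int *step_nodes,
			       const unsigned *job_alloc_lps,
			       unsigned *step_alloc_lps)
{
	int i, job_inx = -1, step_inx = -1;

	for (i = 0; i < node_cnt; i++) {
		if (!job_nodes[i])
			continue;
		job_inx++;			/* position in the job's array */
		if (!step_nodes[i])
			continue;
		step_inx++;			/* position in the step's array */
		step_alloc_lps[step_inx] = job_alloc_lps[job_inx];
	}
	return step_inx + 1;
}

int main(void)
{
	/* Job uses nodes 0,1,2 with 4,2,8 CPUs; the step runs only on nodes 0 and 2 */
	int job_nodes[]  = {1, 1, 1, 0};
	int step_nodes[] = {1, 0, 1, 0};
	unsigned job_alloc_lps[] = {4, 2, 8};
	unsigned step_alloc_lps[4];
	int cnt = translate_alloc_lps(4, job_nodes, step_nodes,
				      job_alloc_lps, step_alloc_lps);

	for (int i = 0; i < cnt; i++)
		printf("step node %d: %u CPUs\n", i, step_alloc_lps[i]);
	return 0;	/* prints 4 CPUs and 8 CPUs */
}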