diff --git a/src/common/slurm_protocol_defs.h b/src/common/slurm_protocol_defs.h index 62e0d9057c9d35c59bdba04fb7e8c0b974dcc6c5..91b58a79d9d7fb03c978a6d3e8214e144ef1f85b 100644 --- a/src/common/slurm_protocol_defs.h +++ b/src/common/slurm_protocol_defs.h @@ -973,6 +973,7 @@ typedef struct batch_job_launch_msg { uint16_t job_core_spec; /* Count of specialized cores */ char *alias_list; /* node name/address/hostnamne aliases */ char *nodes; /* list of nodes allocated to job_step */ + uint32_t profile; /* what to profile for the batch step */ char *script; /* the actual job script, default NONE */ char *std_err; /* pathname of stderr */ char *std_in; /* pathname of stdin */ diff --git a/src/common/slurm_protocol_pack.c b/src/common/slurm_protocol_pack.c index c54e13c6b09559a38455bc58e10b31dff1de4344..f056c59bc72d173ea7039e8a635ff0c98356bd53 100644 --- a/src/common/slurm_protocol_pack.c +++ b/src/common/slurm_protocol_pack.c @@ -12317,7 +12317,68 @@ _pack_batch_job_launch_msg(batch_job_launch_msg_t * msg, Buf buffer, { xassert(msg != NULL); - if (protocol_version >= SLURM_15_08_PROTOCOL_VERSION) { + if (protocol_version >= SLURM_16_05_PROTOCOL_VERSION) { + pack32(msg->job_id, buffer); + pack32(msg->step_id, buffer); + pack32(msg->uid, buffer); + packstr(msg->partition, buffer); + packstr(msg->user_name, buffer); + pack32(msg->gid, buffer); + pack32(msg->ntasks, buffer); + pack32(msg->pn_min_memory, buffer); + + pack8(msg->open_mode, buffer); + pack8(msg->overcommit, buffer); + + pack32(msg->array_job_id, buffer); + pack32(msg->array_task_id, buffer); + + packstr(msg->acctg_freq, buffer); + pack16(msg->cpu_bind_type, buffer); + pack16(msg->cpus_per_task, buffer); + pack16(msg->restart_cnt, buffer); + pack16(msg->job_core_spec, buffer); + + pack32(msg->num_cpu_groups, buffer); + if (msg->num_cpu_groups) { + pack16_array(msg->cpus_per_node, msg->num_cpu_groups, + buffer); + pack32_array(msg->cpu_count_reps, msg->num_cpu_groups, + buffer); + } + + packstr(msg->alias_list, buffer); + packstr(msg->cpu_bind, buffer); + packstr(msg->nodes, buffer); + packstr(msg->script, buffer); + packstr(msg->work_dir, buffer); + packstr(msg->ckpt_dir, buffer); + packstr(msg->restart_dir, buffer); + + packstr(msg->std_err, buffer); + packstr(msg->std_in, buffer); + packstr(msg->std_out, buffer); + + pack32(msg->argc, buffer); + packstr_array(msg->argv, msg->argc, buffer); + packstr_array(msg->spank_job_env, msg->spank_job_env_size, + buffer); + + pack32(msg->envc, buffer); + packstr_array(msg->environment, msg->envc, buffer); + + pack32(msg->job_mem, buffer); + + slurm_cred_pack(msg->cred, buffer, protocol_version); + + select_g_select_jobinfo_pack(msg->select_jobinfo, buffer, + protocol_version); + + packstr(msg->account, buffer); + packstr(msg->qos, buffer); + packstr(msg->resv_name, buffer); + pack32(msg->profile, buffer); + } else if (protocol_version >= SLURM_15_08_PROTOCOL_VERSION) { pack32(msg->job_id, buffer); pack32(msg->step_id, buffer); pack32(msg->uid, buffer); @@ -12451,7 +12512,98 @@ _unpack_batch_job_launch_msg(batch_job_launch_msg_t ** msg, Buf buffer, launch_msg_ptr = xmalloc(sizeof(batch_job_launch_msg_t)); *msg = launch_msg_ptr; - if (protocol_version >= SLURM_15_08_PROTOCOL_VERSION) { + if (protocol_version >= SLURM_16_05_PROTOCOL_VERSION) { + safe_unpack32(&launch_msg_ptr->job_id, buffer); + safe_unpack32(&launch_msg_ptr->step_id, buffer); + safe_unpack32(&launch_msg_ptr->uid, buffer); + safe_unpackstr_xmalloc(&launch_msg_ptr->partition, + &uint32_tmp, buffer); + safe_unpackstr_xmalloc(&launch_msg_ptr->user_name, + &uint32_tmp, buffer); + safe_unpack32(&launch_msg_ptr->gid, buffer); + safe_unpack32(&launch_msg_ptr->ntasks, buffer); + safe_unpack32(&launch_msg_ptr->pn_min_memory, buffer); + + safe_unpack8(&launch_msg_ptr->open_mode, buffer); + safe_unpack8(&launch_msg_ptr->overcommit, buffer); + + safe_unpack32(&launch_msg_ptr->array_job_id, buffer); + safe_unpack32(&launch_msg_ptr->array_task_id, buffer); + + safe_unpackstr_xmalloc(&launch_msg_ptr->acctg_freq, + &uint32_tmp, buffer); + safe_unpack16(&launch_msg_ptr->cpu_bind_type, buffer); + safe_unpack16(&launch_msg_ptr->cpus_per_task, buffer); + safe_unpack16(&launch_msg_ptr->restart_cnt, buffer); + safe_unpack16(&launch_msg_ptr->job_core_spec, buffer); + + safe_unpack32(&launch_msg_ptr->num_cpu_groups, buffer); + if (launch_msg_ptr->num_cpu_groups) { + safe_unpack16_array(&(launch_msg_ptr->cpus_per_node), + &uint32_tmp, buffer); + if (launch_msg_ptr->num_cpu_groups != uint32_tmp) + goto unpack_error; + safe_unpack32_array(&(launch_msg_ptr->cpu_count_reps), + &uint32_tmp, buffer); + if (launch_msg_ptr->num_cpu_groups != uint32_tmp) + goto unpack_error; + } + + safe_unpackstr_xmalloc(&launch_msg_ptr->alias_list, + &uint32_tmp, buffer); + safe_unpackstr_xmalloc(&launch_msg_ptr->cpu_bind, &uint32_tmp, + buffer); + safe_unpackstr_xmalloc(&launch_msg_ptr->nodes, &uint32_tmp, + buffer); + safe_unpackstr_xmalloc(&launch_msg_ptr->script, &uint32_tmp, + buffer); + safe_unpackstr_xmalloc(&launch_msg_ptr->work_dir, &uint32_tmp, + buffer); + safe_unpackstr_xmalloc(&launch_msg_ptr->ckpt_dir, &uint32_tmp, + buffer); + safe_unpackstr_xmalloc(&launch_msg_ptr->restart_dir, + &uint32_tmp, buffer); + + safe_unpackstr_xmalloc(&launch_msg_ptr->std_err, &uint32_tmp, + buffer); + safe_unpackstr_xmalloc(&launch_msg_ptr->std_in, &uint32_tmp, + buffer); + safe_unpackstr_xmalloc(&launch_msg_ptr->std_out, &uint32_tmp, + buffer); + + safe_unpack32(&launch_msg_ptr->argc, buffer); + safe_unpackstr_array(&launch_msg_ptr->argv, + &launch_msg_ptr->argc, buffer); + safe_unpackstr_array(&launch_msg_ptr->spank_job_env, + &launch_msg_ptr->spank_job_env_size, + buffer); + + safe_unpack32(&launch_msg_ptr->envc, buffer); + safe_unpackstr_array(&launch_msg_ptr->environment, + &launch_msg_ptr->envc, buffer); + + safe_unpack32(&launch_msg_ptr->job_mem, buffer); + + if (!(launch_msg_ptr->cred = slurm_cred_unpack( + buffer, protocol_version))) + goto unpack_error; + + if (select_g_select_jobinfo_unpack(&launch_msg_ptr-> + select_jobinfo, + buffer, protocol_version)) + goto unpack_error; + + safe_unpackstr_xmalloc(&launch_msg_ptr->account, + &uint32_tmp, + buffer); + safe_unpackstr_xmalloc(&launch_msg_ptr->qos, + &uint32_tmp, + buffer); + safe_unpackstr_xmalloc(&launch_msg_ptr->resv_name, + &uint32_tmp, + buffer); + safe_unpack32(&launch_msg_ptr->profile, buffer); + } else if (protocol_version >= SLURM_15_08_PROTOCOL_VERSION) { safe_unpack32(&launch_msg_ptr->job_id, buffer); safe_unpack32(&launch_msg_ptr->step_id, buffer); safe_unpack32(&launch_msg_ptr->uid, buffer); diff --git a/src/slurmctld/job_scheduler.c b/src/slurmctld/job_scheduler.c index e03f48163847b490006b1b42de158b851e1393b0..7bb5e50af70716055d331e0ec11743e9aaf11099 100644 --- a/src/slurmctld/job_scheduler.c +++ b/src/slurmctld/job_scheduler.c @@ -2027,6 +2027,7 @@ extern batch_job_launch_msg_t *build_launch_job_msg(struct job_record *job_ptr, launch_msg_ptr->cpus_per_task = job_ptr->details->cpus_per_task; launch_msg_ptr->pn_min_memory = job_ptr->details->pn_min_memory; launch_msg_ptr->restart_cnt = job_ptr->restart_cnt; + launch_msg_ptr->profile = job_ptr->profile; if (make_batch_job_cred(launch_msg_ptr, job_ptr, protocol_version)) { /* FIXME: This is a kludge, but this event indicates a serious diff --git a/src/slurmctld/proc_req.c b/src/slurmctld/proc_req.c index aa3dc4a00c34972778f598ce2c988018fff83d6e..54f222b91c511f1db08a70decd0298d34fb4d61b 100644 --- a/src/slurmctld/proc_req.c +++ b/src/slurmctld/proc_req.c @@ -4675,6 +4675,7 @@ static int _launch_batch_step(job_desc_msg_t *job_desc_msg, uid_t uid, launch_msg_ptr->uid = uid; launch_msg_ptr->nodes = xstrdup(job_ptr->alias_list); launch_msg_ptr->partition = xstrdup(job_ptr->partition); + launch_msg_ptr->restart_cnt = job_ptr->restart_cnt; if (job_ptr->details) { launch_msg_ptr->pn_min_memory = job_ptr->details-> @@ -4735,6 +4736,11 @@ static int _launch_batch_step(job_desc_msg_t *job_desc_msg, uid_t uid, launch_msg_ptr->select_jobinfo = select_g_select_jobinfo_copy( job_ptr->select_jobinfo); + if (job_desc_msg->profile != ACCT_GATHER_PROFILE_NOT_SET) + launch_msg_ptr->profile = job_desc_msg->profile; + else + launch_msg_ptr->profile = job_ptr->profile; + /* FIXME: for some reason these CPU arrays total all the CPUs * actually allocated, rather than totaling up to the requested * CPU count for the allocation. diff --git a/src/slurmd/slurmstepd/slurmstepd_job.c b/src/slurmd/slurmstepd/slurmstepd_job.c index efedd5defd3f1b87cf7dae2bda0dc887b894b1a6..31b94eed3b621976db38dbf32942c65ca806ffc5 100644 --- a/src/slurmd/slurmstepd/slurmstepd_job.c +++ b/src/slurmd/slurmstepd/slurmstepd_job.c @@ -457,6 +457,7 @@ batch_stepd_step_rec_create(batch_job_launch_msg_t *msg) job->batch = true; job->node_name = xstrdup(conf->node_name); job->user_name = xstrdup(msg->user_name); + job->profile = msg->profile; /* This needs to happen before acct_gather_profile_startpoll and only really looks at the profile in the job. */