From 64f1f691484f99a732dfd1a49a0e53c5e76a6d56 Mon Sep 17 00:00:00 2001 From: Danny Auble <da@schedmd.com> Date: Mon, 29 Feb 2016 14:34:27 -0800 Subject: [PATCH] Add profile request to the batch step, continuation of commit ff2c5b887db --- src/common/slurm_protocol_defs.h | 1 + src/common/slurm_protocol_pack.c | 156 ++++++++++++++++++++++++- src/slurmctld/job_scheduler.c | 1 + src/slurmctld/proc_req.c | 6 + src/slurmd/slurmstepd/slurmstepd_job.c | 1 + 5 files changed, 163 insertions(+), 2 deletions(-) diff --git a/src/common/slurm_protocol_defs.h b/src/common/slurm_protocol_defs.h index 62e0d9057c9..91b58a79d9d 100644 --- a/src/common/slurm_protocol_defs.h +++ b/src/common/slurm_protocol_defs.h @@ -973,6 +973,7 @@ typedef struct batch_job_launch_msg { uint16_t job_core_spec; /* Count of specialized cores */ char *alias_list; /* node name/address/hostnamne aliases */ char *nodes; /* list of nodes allocated to job_step */ + uint32_t profile; /* what to profile for the batch step */ char *script; /* the actual job script, default NONE */ char *std_err; /* pathname of stderr */ char *std_in; /* pathname of stdin */ diff --git a/src/common/slurm_protocol_pack.c b/src/common/slurm_protocol_pack.c index c54e13c6b09..f056c59bc72 100644 --- a/src/common/slurm_protocol_pack.c +++ b/src/common/slurm_protocol_pack.c @@ -12317,7 +12317,68 @@ _pack_batch_job_launch_msg(batch_job_launch_msg_t * msg, Buf buffer, { xassert(msg != NULL); - if (protocol_version >= SLURM_15_08_PROTOCOL_VERSION) { + if (protocol_version >= SLURM_16_05_PROTOCOL_VERSION) { + pack32(msg->job_id, buffer); + pack32(msg->step_id, buffer); + pack32(msg->uid, buffer); + packstr(msg->partition, buffer); + packstr(msg->user_name, buffer); + pack32(msg->gid, buffer); + pack32(msg->ntasks, buffer); + pack32(msg->pn_min_memory, buffer); + + pack8(msg->open_mode, buffer); + pack8(msg->overcommit, buffer); + + pack32(msg->array_job_id, buffer); + pack32(msg->array_task_id, buffer); + + packstr(msg->acctg_freq, buffer); + pack16(msg->cpu_bind_type, buffer); + pack16(msg->cpus_per_task, buffer); + pack16(msg->restart_cnt, buffer); + pack16(msg->job_core_spec, buffer); + + pack32(msg->num_cpu_groups, buffer); + if (msg->num_cpu_groups) { + pack16_array(msg->cpus_per_node, msg->num_cpu_groups, + buffer); + pack32_array(msg->cpu_count_reps, msg->num_cpu_groups, + buffer); + } + + packstr(msg->alias_list, buffer); + packstr(msg->cpu_bind, buffer); + packstr(msg->nodes, buffer); + packstr(msg->script, buffer); + packstr(msg->work_dir, buffer); + packstr(msg->ckpt_dir, buffer); + packstr(msg->restart_dir, buffer); + + packstr(msg->std_err, buffer); + packstr(msg->std_in, buffer); + packstr(msg->std_out, buffer); + + pack32(msg->argc, buffer); + packstr_array(msg->argv, msg->argc, buffer); + packstr_array(msg->spank_job_env, msg->spank_job_env_size, + buffer); + + pack32(msg->envc, buffer); + packstr_array(msg->environment, msg->envc, buffer); + + pack32(msg->job_mem, buffer); + + slurm_cred_pack(msg->cred, buffer, protocol_version); + + select_g_select_jobinfo_pack(msg->select_jobinfo, buffer, + protocol_version); + + packstr(msg->account, buffer); + packstr(msg->qos, buffer); + packstr(msg->resv_name, buffer); + pack32(msg->profile, buffer); + } else if (protocol_version >= SLURM_15_08_PROTOCOL_VERSION) { pack32(msg->job_id, buffer); pack32(msg->step_id, buffer); pack32(msg->uid, buffer); @@ -12451,7 +12512,98 @@ _unpack_batch_job_launch_msg(batch_job_launch_msg_t ** msg, Buf buffer, launch_msg_ptr = xmalloc(sizeof(batch_job_launch_msg_t)); *msg = launch_msg_ptr; - if (protocol_version >= SLURM_15_08_PROTOCOL_VERSION) { + if (protocol_version >= SLURM_16_05_PROTOCOL_VERSION) { + safe_unpack32(&launch_msg_ptr->job_id, buffer); + safe_unpack32(&launch_msg_ptr->step_id, buffer); + safe_unpack32(&launch_msg_ptr->uid, buffer); + safe_unpackstr_xmalloc(&launch_msg_ptr->partition, + &uint32_tmp, buffer); + safe_unpackstr_xmalloc(&launch_msg_ptr->user_name, + &uint32_tmp, buffer); + safe_unpack32(&launch_msg_ptr->gid, buffer); + safe_unpack32(&launch_msg_ptr->ntasks, buffer); + safe_unpack32(&launch_msg_ptr->pn_min_memory, buffer); + + safe_unpack8(&launch_msg_ptr->open_mode, buffer); + safe_unpack8(&launch_msg_ptr->overcommit, buffer); + + safe_unpack32(&launch_msg_ptr->array_job_id, buffer); + safe_unpack32(&launch_msg_ptr->array_task_id, buffer); + + safe_unpackstr_xmalloc(&launch_msg_ptr->acctg_freq, + &uint32_tmp, buffer); + safe_unpack16(&launch_msg_ptr->cpu_bind_type, buffer); + safe_unpack16(&launch_msg_ptr->cpus_per_task, buffer); + safe_unpack16(&launch_msg_ptr->restart_cnt, buffer); + safe_unpack16(&launch_msg_ptr->job_core_spec, buffer); + + safe_unpack32(&launch_msg_ptr->num_cpu_groups, buffer); + if (launch_msg_ptr->num_cpu_groups) { + safe_unpack16_array(&(launch_msg_ptr->cpus_per_node), + &uint32_tmp, buffer); + if (launch_msg_ptr->num_cpu_groups != uint32_tmp) + goto unpack_error; + safe_unpack32_array(&(launch_msg_ptr->cpu_count_reps), + &uint32_tmp, buffer); + if (launch_msg_ptr->num_cpu_groups != uint32_tmp) + goto unpack_error; + } + + safe_unpackstr_xmalloc(&launch_msg_ptr->alias_list, + &uint32_tmp, buffer); + safe_unpackstr_xmalloc(&launch_msg_ptr->cpu_bind, &uint32_tmp, + buffer); + safe_unpackstr_xmalloc(&launch_msg_ptr->nodes, &uint32_tmp, + buffer); + safe_unpackstr_xmalloc(&launch_msg_ptr->script, &uint32_tmp, + buffer); + safe_unpackstr_xmalloc(&launch_msg_ptr->work_dir, &uint32_tmp, + buffer); + safe_unpackstr_xmalloc(&launch_msg_ptr->ckpt_dir, &uint32_tmp, + buffer); + safe_unpackstr_xmalloc(&launch_msg_ptr->restart_dir, + &uint32_tmp, buffer); + + safe_unpackstr_xmalloc(&launch_msg_ptr->std_err, &uint32_tmp, + buffer); + safe_unpackstr_xmalloc(&launch_msg_ptr->std_in, &uint32_tmp, + buffer); + safe_unpackstr_xmalloc(&launch_msg_ptr->std_out, &uint32_tmp, + buffer); + + safe_unpack32(&launch_msg_ptr->argc, buffer); + safe_unpackstr_array(&launch_msg_ptr->argv, + &launch_msg_ptr->argc, buffer); + safe_unpackstr_array(&launch_msg_ptr->spank_job_env, + &launch_msg_ptr->spank_job_env_size, + buffer); + + safe_unpack32(&launch_msg_ptr->envc, buffer); + safe_unpackstr_array(&launch_msg_ptr->environment, + &launch_msg_ptr->envc, buffer); + + safe_unpack32(&launch_msg_ptr->job_mem, buffer); + + if (!(launch_msg_ptr->cred = slurm_cred_unpack( + buffer, protocol_version))) + goto unpack_error; + + if (select_g_select_jobinfo_unpack(&launch_msg_ptr-> + select_jobinfo, + buffer, protocol_version)) + goto unpack_error; + + safe_unpackstr_xmalloc(&launch_msg_ptr->account, + &uint32_tmp, + buffer); + safe_unpackstr_xmalloc(&launch_msg_ptr->qos, + &uint32_tmp, + buffer); + safe_unpackstr_xmalloc(&launch_msg_ptr->resv_name, + &uint32_tmp, + buffer); + safe_unpack32(&launch_msg_ptr->profile, buffer); + } else if (protocol_version >= SLURM_15_08_PROTOCOL_VERSION) { safe_unpack32(&launch_msg_ptr->job_id, buffer); safe_unpack32(&launch_msg_ptr->step_id, buffer); safe_unpack32(&launch_msg_ptr->uid, buffer); diff --git a/src/slurmctld/job_scheduler.c b/src/slurmctld/job_scheduler.c index e03f4816384..7bb5e50af70 100644 --- a/src/slurmctld/job_scheduler.c +++ b/src/slurmctld/job_scheduler.c @@ -2027,6 +2027,7 @@ extern batch_job_launch_msg_t *build_launch_job_msg(struct job_record *job_ptr, launch_msg_ptr->cpus_per_task = job_ptr->details->cpus_per_task; launch_msg_ptr->pn_min_memory = job_ptr->details->pn_min_memory; launch_msg_ptr->restart_cnt = job_ptr->restart_cnt; + launch_msg_ptr->profile = job_ptr->profile; if (make_batch_job_cred(launch_msg_ptr, job_ptr, protocol_version)) { /* FIXME: This is a kludge, but this event indicates a serious diff --git a/src/slurmctld/proc_req.c b/src/slurmctld/proc_req.c index aa3dc4a00c3..54f222b91c5 100644 --- a/src/slurmctld/proc_req.c +++ b/src/slurmctld/proc_req.c @@ -4675,6 +4675,7 @@ static int _launch_batch_step(job_desc_msg_t *job_desc_msg, uid_t uid, launch_msg_ptr->uid = uid; launch_msg_ptr->nodes = xstrdup(job_ptr->alias_list); launch_msg_ptr->partition = xstrdup(job_ptr->partition); + launch_msg_ptr->restart_cnt = job_ptr->restart_cnt; if (job_ptr->details) { launch_msg_ptr->pn_min_memory = job_ptr->details-> @@ -4735,6 +4736,11 @@ static int _launch_batch_step(job_desc_msg_t *job_desc_msg, uid_t uid, launch_msg_ptr->select_jobinfo = select_g_select_jobinfo_copy( job_ptr->select_jobinfo); + if (job_desc_msg->profile != ACCT_GATHER_PROFILE_NOT_SET) + launch_msg_ptr->profile = job_desc_msg->profile; + else + launch_msg_ptr->profile = job_ptr->profile; + /* FIXME: for some reason these CPU arrays total all the CPUs * actually allocated, rather than totaling up to the requested * CPU count for the allocation. diff --git a/src/slurmd/slurmstepd/slurmstepd_job.c b/src/slurmd/slurmstepd/slurmstepd_job.c index efedd5defd3..31b94eed3b6 100644 --- a/src/slurmd/slurmstepd/slurmstepd_job.c +++ b/src/slurmd/slurmstepd/slurmstepd_job.c @@ -457,6 +457,7 @@ batch_stepd_step_rec_create(batch_job_launch_msg_t *msg) job->batch = true; job->node_name = xstrdup(conf->node_name); job->user_name = xstrdup(msg->user_name); + job->profile = msg->profile; /* This needs to happen before acct_gather_profile_startpoll and only really looks at the profile in the job. */ -- GitLab