diff --git a/NEWS b/NEWS index 32cc70e727911d69f59b8a9b998c4d3e51418009..a38e0f57bf710d7ce6b39641996e8b18c405649f 100644 --- a/NEWS +++ b/NEWS @@ -26,7 +26,8 @@ documents those changes that are of interest to users and admins. cores, or threads as appropriated based upon resource allocation and task count. User can override with srun's --cpu_bind option. -- Fix bug in backfill logic for select/cons_res plugin, resulted in - error "cons_res:_rm_job_from_res: node_state mis-count" + error "cons_res:_rm_job_from_res: node_state mis-count". + -- Add logic to bind a batch job to the resources allocated to that job. * Changes in SLURM 1.4.0-pre7 ============================= diff --git a/doc/html/taskplugins.shtml b/doc/html/taskplugins.shtml index 7af88daa853314b2bb544f85760101fb872379fb..f476e2e94384c3c09cafcbc7bf412f9989ece3f3 100644 --- a/doc/html/taskplugins.shtml +++ b/doc/html/taskplugins.shtml @@ -46,6 +46,22 @@ SLURM_ERROR.</p> <p>The following functions must appear. Functions which are not implemented should be stubbed.</p> +<p class="commandline">int task_slurmd_batch_request (uint32_t job_id, +batch_job_launch_msg_t *req);</p> +<p style="margin-left:.2in"><b>Description</b>: Prepare to launch a batch job. +Establish node, socket, and core resource availability for it. +Executed by the <b>slurmd</b> daemon as user root.</p> +<p style="margin-left:.2in"><b>Arguments</b>:<br> +<span class="commandline">job_id</span> (input) +ID of the job to be started.<br> +<span class="commandline">req</span> (input/output) +Batch job launch request specification. +See <b>src/common/slurm_protocol_defs.h</b> for the +data structure definition.</p> +<p style="margin-left:.2in"><b>Returns</b>: SLURM_SUCCESS if successful. 
+On failure, the plugin should return SLURM_ERROR and set the errno to an +appropriate value to indicate the reason for failure.</p> + <p class="commandline">int task_slurmd_launch_request (uint32_t job_id, launch_tasks_request_msg_t *req, uint32_t node_id);</p> <p style="margin-left:.2in"><b>Description</b>: Prepare to launch a job. @@ -164,6 +180,6 @@ appropriate value to indicate the reason for failure.</p> Future releases of SLURM may revise this API.</p> <p class="footer"><a href="#top">top</a></p> -<p style="text-align:center;">Last modified 28 May 2008</p> +<p style="text-align:center;">Last modified 19 February 2009</p> <!--#include virtual="footer.txt"--> diff --git a/src/common/slurm_protocol_defs.c b/src/common/slurm_protocol_defs.c index 5a1f6d4966b38937ab9a45cd2e3118305dc37bc6..b58752bbb528968f68258c3767a16fa44f4df053 100644 --- a/src/common/slurm_protocol_defs.c +++ b/src/common/slurm_protocol_defs.c @@ -331,6 +331,7 @@ void slurm_free_job_launch_msg(batch_job_launch_msg_t * msg) if (msg) { xfree(msg->nodes); + xfree(msg->cpu_bind); xfree(msg->cpus_per_node); xfree(msg->cpu_count_reps); xfree(msg->script); diff --git a/src/common/slurm_protocol_defs.h b/src/common/slurm_protocol_defs.h index b86729039c354f04dc678e425f6941c2668cae0d..672d5d979f13cfd5d238b6dcda75bdc239371945 100644 --- a/src/common/slurm_protocol_defs.h +++ b/src/common/slurm_protocol_defs.h @@ -609,6 +609,8 @@ typedef struct batch_job_launch_msg { uint32_t gid; uint32_t nprocs; /* number of tasks in this job */ uint32_t num_cpu_groups;/* elements in below cpu arrays */ + uint16_t cpu_bind_type; /* Internal for slurmd/task_affinity */ + char *cpu_bind; /* Internal for slurmd/task_affinity */ uint16_t *cpus_per_node;/* cpus per node */ uint32_t *cpu_count_reps;/* how many nodes have same cpu count */ uint16_t cpus_per_task; /* number of CPUs requested per task */ diff --git a/src/common/slurm_protocol_pack.c b/src/common/slurm_protocol_pack.c index 
3f06ed14da606d468fd81254cab083b8243d3462..c1d9b6366ad6a16723ad83f7ef02bdc1ac4f42b2 100644 --- a/src/common/slurm_protocol_pack.c +++ b/src/common/slurm_protocol_pack.c @@ -4118,6 +4118,7 @@ _pack_batch_job_launch_msg(batch_job_launch_msg_t * msg, Buf buffer) pack8(msg->overcommit, buffer); pack16(msg->acctg_freq, buffer); + pack16(msg->cpu_bind_type, buffer); pack16(msg->cpus_per_task, buffer); pack16(msg->restart_cnt, buffer); @@ -4127,8 +4128,9 @@ _pack_batch_job_launch_msg(batch_job_launch_msg_t * msg, Buf buffer) pack32_array(msg->cpu_count_reps, msg->num_cpu_groups, buffer); } - packstr(msg->nodes, buffer); - packstr(msg->script, buffer); + packstr(msg->cpu_bind, buffer); + packstr(msg->nodes, buffer); + packstr(msg->script, buffer); packstr(msg->work_dir, buffer); packstr(msg->err, buffer); @@ -4168,6 +4170,7 @@ _unpack_batch_job_launch_msg(batch_job_launch_msg_t ** msg, Buf buffer) safe_unpack8(&launch_msg_ptr->overcommit, buffer); safe_unpack16(&launch_msg_ptr->acctg_freq, buffer); + safe_unpack16(&launch_msg_ptr->cpu_bind_type, buffer); safe_unpack16(&launch_msg_ptr->cpus_per_task, buffer); safe_unpack16(&launch_msg_ptr->restart_cnt, buffer); @@ -4182,7 +4185,8 @@ _unpack_batch_job_launch_msg(batch_job_launch_msg_t ** msg, Buf buffer) if (launch_msg_ptr->num_cpu_groups != uint32_tmp) goto unpack_error; } - + + safe_unpackstr_xmalloc(&launch_msg_ptr->cpu_bind, &uint32_tmp, buffer); safe_unpackstr_xmalloc(&launch_msg_ptr->nodes, &uint32_tmp, buffer); safe_unpackstr_xmalloc(&launch_msg_ptr->script, &uint32_tmp, buffer); safe_unpackstr_xmalloc(&launch_msg_ptr->work_dir, &uint32_tmp, buffer); diff --git a/src/plugins/task/affinity/dist_tasks.c b/src/plugins/task/affinity/dist_tasks.c index 7744575c694c3c5d338341d781b51ef1f268a118..d29622bd5f68eed0b0e19e8b2c2680b8bfac5676 100644 --- a/src/plugins/task/affinity/dist_tasks.c +++ b/src/plugins/task/affinity/dist_tasks.c @@ -53,6 +53,8 @@ static char *_alloc_mask(launch_tasks_request_msg_t *req, static bitstr_t 
*_get_avail_map(launch_tasks_request_msg_t *req, uint16_t *hw_sockets, uint16_t *hw_cores, uint16_t *hw_threads); +static int _get_local_node_info(slurm_cred_arg_t *arg, uint32_t job_node_id, + uint16_t *sockets, uint16_t *cores); static int _task_layout_lllp_block(launch_tasks_request_msg_t *req, uint32_t node_id, bitstr_t ***masks_p); @@ -157,6 +159,82 @@ static void _match_masks_to_ldom(const uint32_t maxtasks, bitstr_t **masks) } #endif +/* + * batch_bind - Set the batch request message so as to bind the shell to the + * proper resources + */ +void batch_bind(batch_job_launch_msg_t *req) +{ + bitstr_t *req_map, *hw_map; + slurm_cred_arg_t arg; + uint16_t sockets=0, cores=0, num_procs; + int hw_size, start, p, t, task_cnt=0; + char *str; + + if (slurm_cred_get_args(req->cred, &arg) != SLURM_SUCCESS) { + error("task/affinity: job lacks a credential"); + return; + } + start = _get_local_node_info(&arg, 0, &sockets, &cores); + if (start != 0) { + error("task/affinity: missing node 0 in job credential"); + slurm_cred_free_args(&arg); + return; + } + + hw_size = conf->sockets * conf->cores * conf->threads; + num_procs = MIN((sockets * cores), + (conf->sockets * conf->cores)); + req_map = (bitstr_t *) bit_alloc(num_procs); + hw_map = (bitstr_t *) bit_alloc(hw_size); + if (!req_map || !hw_map) { + error("task/affinity: malloc error"); + bit_free(req_map); + bit_free(hw_map); + slurm_cred_free_args(&arg); + } + + /* Transfer core_bitmap data to local req_map. + * The MOD function handles the case where fewer processes + * physically exist than are configured (slurmd is out of + * sync with the slurmctld daemon). 
*/ + for (p = 0; p < (sockets * cores); p++) { + if (bit_test(arg.core_bitmap, p)) + bit_set(req_map, (p % num_procs)); + } + str = (char *)bit_fmt_hexmask(req_map); + debug3("task/affinity: job %u CPU mask from slurmctld: %s", + req->job_id, str); + xfree(str); + + for (p = 0; p < num_procs; p++) { + if (bit_test(req_map, p) == 0) + continue; + /* core_bitmap does not include threads, so we + * add them here but limit them to what the job + * requested */ + for (t = 0; t < conf->threads; t++) { + uint16_t bit = p * conf->threads + t; + bit_set(hw_map, bit); + task_cnt++; + } + } + if (task_cnt) { + req->cpu_bind_type = CPU_BIND_MASK; + if (conf->task_plugin_param & CPU_BIND_VERBOSE) + req->cpu_bind_type |= CPU_BIND_VERBOSE; + req->cpu_bind = (char *)bit_fmt_hexmask(hw_map); + info("task/affinity: job %u CPU final mask for node: %s", + req->job_id, req->cpu_bind); + } else { + error("task/affinity: job %u allocated not CPUs", + req->job_id); + } + bit_free(hw_map); + bit_free(req_map); + slurm_cred_free_args(&arg); +} + /* * lllp_distribution * @@ -299,8 +377,8 @@ void lllp_distribution(launch_tasks_request_msg_t *req, uint32_t node_id) } -/* helper function for _get_avail_map - * +/* + * _get_local_node_info - get job allocation details for this node * IN: req - launch request structure * IN: job_node_id - index of the local node in the job allocation * IN/OUT: sockets - pointer to socket count variable @@ -467,6 +545,7 @@ static bitstr_t *_get_avail_map(launch_tasks_request_msg_t *req, if (start < 0) { error("task/affinity: missing node %u in job credential", job_node_id); + slurm_cred_free_args(&arg); return NULL; } debug3("task/affinity: slurmctld s %u c %u; hw s %u c %u t %u", @@ -480,15 +559,17 @@ static bitstr_t *_get_avail_map(launch_tasks_request_msg_t *req, error("task/affinity: malloc error"); bit_free(req_map); bit_free(hw_map); + slurm_cred_free_args(&arg); return NULL; } /* Transfer core_bitmap data to local req_map. 
* The MOD function handles the case where fewer processes * physically exist than are configured (slurmd is out of * sync with the slurmctld daemon). */ - for (p = 0; p < (sockets * cores); p++) + for (p = 0; p < (sockets * cores); p++) { if (bit_test(arg.core_bitmap, start+p)) bit_set(req_map, (p % num_procs)); + } str = (char *)bit_fmt_hexmask(req_map); debug3("task/affinity: job %u.%u CPU mask from slurmctld: %s", @@ -499,7 +580,6 @@ static bitstr_t *_get_avail_map(launch_tasks_request_msg_t *req, for (p = 0; p < num_procs; p++) { if (bit_test(req_map, p) == 0) continue; - bit_clear(req_map, p); /* core_bitmap does not include threads, so we * add them here but limit them to what the job * requested */ @@ -518,6 +598,7 @@ static bitstr_t *_get_avail_map(launch_tasks_request_msg_t *req, xfree(str); bit_free(req_map); + slurm_cred_free_args(&arg); return hw_map; } diff --git a/src/plugins/task/affinity/dist_tasks.h b/src/plugins/task/affinity/dist_tasks.h index 24580a3dbd38b35a4675ac381fe1ec57edf44971..45d38a29f3366b30142e95175aeb4b997caec69c 100644 --- a/src/plugins/task/affinity/dist_tasks.h +++ b/src/plugins/task/affinity/dist_tasks.h @@ -38,6 +38,7 @@ #include <stdint.h> #include "src/common/slurm_protocol_defs.h" +void batch_bind(batch_job_launch_msg_t *req); void lllp_distribution(launch_tasks_request_msg_t *req, uint32_t node_id); #endif /* !_SLURMSTEPD_DIST_TASKS_H */ diff --git a/src/plugins/task/affinity/task_affinity.c b/src/plugins/task/affinity/task_affinity.c index 65ef0bd806bf8438bc9c5e155d8fb8b2340d7421..a5df89b5b785c6060f9c1b329e4d19b9849449d5 100644 --- a/src/plugins/task/affinity/task_affinity.c +++ b/src/plugins/task/affinity/task_affinity.c @@ -156,6 +156,17 @@ static void _update_bind_type(launch_tasks_request_msg_t *req) } } +/* + * task_slurmd_batch_request() + */ +extern int task_slurmd_batch_request (uint32_t job_id, + batch_job_launch_msg_t *req) +{ + info("task_slurmd_batch_request: %u", job_id); + batch_bind(req); + return 
SLURM_SUCCESS; +} + /* * task_slurmd_launch_request() */ @@ -185,9 +196,6 @@ extern int task_slurmd_launch_request (uint32_t job_id, info("task affinity : after lllp distribution cpu bind " "method is '%s' (%s)", buf_type, req->cpu_bind); } - - /* Remove the slurm msg timeout needs to be investigated some more */ - /* req->cpu_bind_type = CPU_BIND_NONE; */ return SLURM_SUCCESS; } diff --git a/src/plugins/task/none/task_none.c b/src/plugins/task/none/task_none.c index 5e852eab6596b349b6eebdf1efb47719b34bb228..cc2f96ab47b7cb9d81ce94c194dd3c88e2263b5d 100644 --- a/src/plugins/task/none/task_none.c +++ b/src/plugins/task/none/task_none.c @@ -98,6 +98,16 @@ extern int fini (void) return SLURM_SUCCESS; } +/* + * task_slurmd_batch_request() + */ +extern int task_slurmd_batch_request (uint32_t job_id, + batch_job_launch_msg_t *req) +{ + debug("task_slurmd_batch_request: %u", job_id); + return SLURM_SUCCESS; +} + /* * task_slurmd_launch_request() */ diff --git a/src/slurmd/common/set_oomadj.c b/src/slurmd/common/set_oomadj.c index b443cd993b1a9eaac45f9a77826520e031faea80..126b567cd9ba5f798a6c63bd93fe5d0f8ddafabb 100644 --- a/src/slurmd/common/set_oomadj.c +++ b/src/slurmd/common/set_oomadj.c @@ -50,7 +50,10 @@ extern int set_oom_adj(int adj) fd = open("/proc/self/oom_adj", O_WRONLY); if (fd < 0) { - verbose("failed to open /proc/self/oom_adj: %m"); + if (errno == ENOENT) + debug("failed to open /proc/self/oom_adj: %m"); + else + verbose("failed to open /proc/self/oom_adj: %m"); return -1; } if (snprintf(oom_adj, 16, "%d", adj) >= 16) { diff --git a/src/slurmd/common/task_plugin.c b/src/slurmd/common/task_plugin.c index 3841edb68e3aa66b3d850546cc98862a59077ce3..364340ddece96a80b91e4271b930bfbeca74d4e7 100644 --- a/src/slurmd/common/task_plugin.c +++ b/src/slurmd/common/task_plugin.c @@ -1,8 +1,8 @@ /*****************************************************************************\ - * task_plugin.h - task launch plugin stub. + * task_plugin.c - task launch plugin stub. 
***************************************************************************** * Copyright (C) 2005-2007 The Regents of the University of California. - * Copyright (C) 2008 Lawrence Livermore National Security. + * Copyright (C) 2008-2009 Lawrence Livermore National Security. * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). * Written by Morris Jette <jette1@llnl.gov> * LLNL-CODE-402394. @@ -47,6 +47,8 @@ #include "src/slurmd/slurmstepd/slurmstepd_job.h" typedef struct slurmd_task_ops { + int (*slurmd_batch_request) (uint32_t job_id, + batch_job_launch_msg_t *req); int (*slurmd_launch_request) (uint32_t job_id, launch_tasks_request_msg_t *req, uint32_t node_id); @@ -81,6 +83,7 @@ _slurmd_task_get_ops(slurmd_task_context_t *c) * Must be synchronized with slurmd_task_ops_t above. */ static const char *syms[] = { + "task_slurmd_batch_request", "task_slurmd_launch_request", "task_slurmd_reserve_resources", "task_slurmd_suspend_job", @@ -231,6 +234,19 @@ extern int slurmd_task_fini(void) return rc; } +/* + * Slurmd has received a batch job launch request. + * + * RET - slurm error code + */ +extern int slurmd_batch_request(uint32_t job_id, batch_job_launch_msg_t *req) +{ + if (slurmd_task_init()) + return SLURM_ERROR; + + return (*(g_task_context->ops.slurmd_batch_request))(job_id, req); +} + /* * Slurmd has received a launch request. * diff --git a/src/slurmd/common/task_plugin.h b/src/slurmd/common/task_plugin.h index 121876569de2e577f7b4507ec9fd0ce4b82e6529..cdb6034f24de939a318db772ce63c0c983a3ac05 100644 --- a/src/slurmd/common/task_plugin.h +++ b/src/slurmd/common/task_plugin.h @@ -2,7 +2,7 @@ * task_plugin.h - Define plugin functions for task pre_launch and post_term. ***************************************************************************** * Copyright (C) 2005-2007 The Regents of the University of California. - * Copyright (C) 2008 Lawrence Livermore National Security. + * Copyright (C) 2008-2009 Lawrence Livermore National Security. 
* Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). * Written by Morris Jette <jette1@llnl.gov> * LLNL-CODE-402394. @@ -61,6 +61,13 @@ extern int slurmd_task_fini(void); ************************************************************************** */ +/* + * Slurmd has received a batch job launch request. + * + * RET - slurm error code + */ +extern int slurmd_batch_request(uint32_t job_id, batch_job_launch_msg_t *req); + /* * Slurmd has received a launch request. * diff --git a/src/slurmd/slurmd/req.c b/src/slurmd/slurmd/req.c index c7746d44fc65291f12ed3b434a1014d292bb0dfb..3d23a2d923d6a5aa116b8698ef2111465e278ba0 100644 --- a/src/slurmd/slurmd/req.c +++ b/src/slurmd/slurmd/req.c @@ -1006,6 +1006,8 @@ _rpc_batch_job(slurm_msg_t *msg) goto done; } + slurmd_batch_request(req->job_id, req); /* determine task affinity */ + if ((req->step_id != SLURM_BATCH_SCRIPT) && (req->step_id != 0)) first_job_run = false; diff --git a/src/slurmd/slurmstepd/slurmstepd.c b/src/slurmd/slurmstepd/slurmstepd.c index f5980366f59c2fe19f2c1a84b2f0d59bc4a5ab4b..a51ceb86480ad8202e76622821a80a9580e119e1 100644 --- a/src/slurmd/slurmstepd/slurmstepd.c +++ b/src/slurmd/slurmstepd/slurmstepd.c @@ -2,7 +2,8 @@ * src/slurmd/slurmstepd/slurmstepd.c - SLURM job-step manager. * $Id$ ***************************************************************************** - * Copyright (C) 2002-2006 The Regents of the University of California. + * Copyright (C) 2002-2007 The Regents of the University of California. + * Copyright (C) 2008-2009 Lawrence Livermore National Security. * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). * Written by Danny Auble <da@llnl.gov> * and Christopher Morrone <morrone2@llnl.gov>. 
diff --git a/src/slurmd/slurmstepd/slurmstepd_job.c b/src/slurmd/slurmstepd/slurmstepd_job.c index 950f3ec42f09f38b168d0deeef8e93f6ebd860a3..4c7b9a6ffbec2dac7bf623d0749b937866124aa0 100644 --- a/src/slurmd/slurmstepd/slurmstepd_job.c +++ b/src/slurmd/slurmstepd/slurmstepd_job.c @@ -355,8 +355,8 @@ job_batch_job_create(batch_job_launch_msg_t *msg) job->envtp->nodeid = -1; job->envtp->distribution = 0; - job->envtp->cpu_bind_type = 0; - job->envtp->cpu_bind = NULL; + job->cpu_bind_type = msg->cpu_bind_type; + job->cpu_bind = xstrdup(msg->cpu_bind); job->envtp->mem_bind_type = 0; job->envtp->mem_bind = NULL; job->envtp->ckpt_path = NULL;