From cc0f39dfdd4d8db5312bf6abccfad87c5df1829c Mon Sep 17 00:00:00 2001 From: Moe Jette <jette1@llnl.gov> Date: Wed, 5 Jul 2006 16:59:51 +0000 Subject: [PATCH] svn merge -r8457:8480 https://eris.llnl.gov/svn/slurm/branches/slurm-1.1 --- NEWS | 13 +- doc/html/configurator.html | 8 +- doc/html/switchplugins.shtml | 34 ++- doc/man/man5/slurm.conf.5 | 13 +- slurm/slurm.h.in | 1 - src/api/step_client_io.c | 17 +- src/common/bitstring.h | 10 +- src/common/fd.c | 11 + src/common/slurm_protocol_defs.c | 4 - src/common/slurm_protocol_defs.h | 1 - src/common/slurm_protocol_pack.c | 2 - src/common/switch.c | 23 ++ src/common/switch.h | 20 +- src/plugins/switch/elan/switch_elan.c | 13 +- src/plugins/switch/federation/federation.c | 129 +++++++++-- .../switch/federation/switch_federation.c | 21 +- src/plugins/switch/none/switch_none.c | 13 +- src/slurmctld/job_mgr.c | 201 ++++++------------ src/slurmctld/node_mgr.c | 16 +- src/slurmctld/node_scheduler.c | 4 +- src/slurmctld/proc_req.c | 25 +-- src/slurmctld/slurmctld.h | 15 +- src/slurmctld/step_mgr.c | 182 +++++++++++++--- src/slurmd/slurmd/req.c | 4 +- src/slurmd/slurmstepd/mgr.c | 4 +- src/slurmd/slurmstepd/req.c | 8 + src/smap/job_functions.c | 1 - src/squeue/print.c | 1 - src/srun/opt.c | 2 +- testsuite/expect/globals | 1 + testsuite/expect/test14.4 | 11 +- testsuite/slurm_unit/common/bitstring-test.c | 2 +- 32 files changed, 558 insertions(+), 252 deletions(-) diff --git a/NEWS b/NEWS index c81fc7053a1..134a3b8ca47 100644 --- a/NEWS +++ b/NEWS @@ -14,6 +14,13 @@ documents those changes that are of interest to users and admins. the code) -- Added support for OSX build. +* Changes in SLURM 1.1.3 +======================== + -- Fix big-endian bug in the bitstring code which plagued AIX. + -- Fix bug in handling srun's --multi-prog option, could go off end of buffer. + -- Added support for job step completion (and switch window release) on + subset of allocated nodes. + * Changes in SLURM 1.1.2 ======================== -- Fix bug in jobcomp/filetxt plugin to report proper NodeCnt when a job @@ -39,9 +46,6 @@ documents those changes that are of interest to users and admins. completed. -- BLUEGENE - added configure option --with-bg-link to choose dynamic linking or static linking with the bridgeapi. - -- Fix to make sure all steps are complete on job before removing allocation - -- send SIGKILL to all steps when an allocation has been completed. - -- new job state JOB_DEALLOCATING * Changes in SLURM 1.1.1 ======================== @@ -197,6 +201,9 @@ documents those changes that are of interest to users and admins. * Changes in SLURM 1.0.15 ========================= + -- In srun, reset stdin to blocking mode (if it was originally blocking before + we set it to O_NONBLOCK) on exit to avoid trouble with things like running + srun under a bash shell in an emacs *shell* buffer. 
* Changes in SLURM 1.0.14 ========================= diff --git a/doc/html/configurator.html b/doc/html/configurator.html index e95b3439495..2654e9e596e 100644 --- a/doc/html/configurator.html +++ b/doc/html/configurator.html @@ -280,12 +280,12 @@ Slurmctld state save directory <input type="text" name="slurmd_spool_dir" value="/tmp/slurmd"> <B>SlurmdSpoolDir</B>: Slurmd state save directory <P> -Define when a DOWN node is returned to service.<BR> +Define when a non-responding (DOWN) node is returned to service.<BR> Select one value for <B>ReturnToService</B>:<BR> +<input type="radio" name="return_to_service" value="0" checked> +<B>0</B>: When explicitly restored to service by an administrator.<BR> <input type="radio" name="return_to_service" value="1"> <B>1</B>: Automatically, when slurmd daemon registers with valid configuration<BR> -<input type="radio" name="return_to_service" value="0" checked> -<B>0</B>: When explicitly restored to service by an administrator. <P> <H2>Scheduling</H2> @@ -464,6 +464,6 @@ before terminating all remaining tasks. A value of zero indicates unlimited wait </FORM> <HR> <p class="footer">UCRL-WEB-217616<br> -Last modified 6 June 2006</p> +Last modified 5 July 2006</p> </BODY> diff --git a/doc/html/switchplugins.shtml b/doc/html/switchplugins.shtml index eedf4c676bc..9703dba8a53 100644 --- a/doc/html/switchplugins.shtml +++ b/doc/html/switchplugins.shtml @@ -285,10 +285,10 @@ to indicate the reason for failure.</p> <p class="commandline">int switch_p_job_step_complete (switch_jobinfo_t switch_job, char *nodelist);</p> <p style="margin-left:.2in"><b>Description</b>: Note that the job step associated -with the specified node has completed execution.</p> -<p style="margin-left:.2in"><b>Arguments</b>:<span class="commandline"> switch_job</span> - - (input) The completed job's switch credential.<br> +with the specified nodelist has completed execution.</p> +<p style="margin-left:.2in"><b>Arguments</b>:<br> +<span class="commandline"> switch_job</span> (input) +The completed job's switch credential.<br> <span class="commandline"> nodelist</span> (input) A list of nodes on which the job has completed. This may contain expressions to specify node ranges. (e.g. "linux[1-20]" or "linux[2,4,6,8]").</p> @@ -296,6 +296,30 @@ on which the job has completed. This may contain expressions to specify node ran the plugin should return SLURM_ERROR and set the errno to an appropriate value to indicate the reason for failure.</p> +<p class="commandline">int switch_p_job_step_part_comp (switch_jobinfo_t switch_job, +char *nodelist);</p> +<p style="margin-left:.2in"><b>Description</b>: Note that the job step has completed +execution on the specified node list. The job step is not necessarily completed on all +nodes, but switch resources associated with it on the specified nodes are no longer +in use.</p> +<p style="margin-left:.2in"><b>Arguments</b>:<br> +<span class="commandline"> switch_job</span> (input) +The completed job's switch credential.<br> +<span class="commandline"> nodelist</span> (input) A list of nodes +on which the job step has completed. This may contain expressions to specify node ranges. +(e.g. "linux[1-20]" or "linux[2,4,6,8]").</p> +<p style="margin-left:.2in"><b>Returns</b>: SLURM_SUCCESS if successful. 
On failure, +the plugin should return SLURM_ERROR and set the errno to an appropriate value +to indicate the reason for failure.</p> + +<p class="commandline">bool switch_p_part_comp (void);</p> +<p style="margin-left:.2in"><b>Description</b>: Indicate if the switch plugin should +process partial job step completions (i.e. switch_g_job_step_part_comp). Support +of partial completions is compute intensive, so it should be avoided unless switch +resources are in short supply (e.g. switch/federation).</p> +<p style="margin-left:.2in"><b>Returns</b>: True if partial step completions are +to be recorded. False if only full job step completions are to be noted.</p> + <p class="commandline">void switch_p_print_jobinfo(FILE *fp, switch_jobinfo_t switch_job);</p> <p style="margin-left:.2in"><b>Description</b>: Print the contents of a job's switch credential to a file.</p> @@ -495,6 +519,6 @@ plugin that transmitted it. It is at the discretion of the plugin author whether to maintain data format compatibility across different versions of the plugin.</p> <p class="footer"><a href="#top">top</a></p> -<p style="text-align:center;">Last modified 1 October 2005</p> +<p style="text-align:center;">Last modified 3 July 2006</p> <!--#include virtual="footer.txt"--> diff --git a/doc/man/man5/slurm.conf.5 b/doc/man/man5/slurm.conf.5 index a0c8d71750c..24b1dfe7cb4 100644 --- a/doc/man/man5/slurm.conf.5 +++ b/doc/man/man5/slurm.conf.5 @@ -1,4 +1,4 @@ -.TH "slurm.conf" "5" "June 2006" "slurm.conf 1.2" "Slurm configuration file" +.TH "slurm.conf" "5" "July 2006" "slurm.conf 1.1" "Slurm configuration file" .SH "NAME" slurm.conf \- Slurm configuration file .SH "DESCRIPTION" @@ -305,10 +305,13 @@ appearing in this list. The user can override this by specifying which resource limits to propagate with the srun commands "--propagate" option. .TP \fBReturnToService\fR -If set to 1, then a DOWN node will become available for use -upon registration. The default value is 0, which -means that a node will remain in the DOWN state -until a system administrator explicitly changes its state +If set to 1, then a non-responding (DOWN) node will become available +for use upon registration. Note that a DOWN node's state will be changed +only if it was set DOWN due to being non-responsive. If the node was +set DOWN for any other reason (low memory, prolog failure, epilog +failure, etc.), its state will not automatically be changed. The +default value is 0, which means that a node will remain in the +DOWN state until a system administrator explicitly changes its state (even if the slurmd daemon registers and resumes communications). .TP \fBSchedulerAuth\fR diff --git a/slurm/slurm.h.in b/slurm/slurm.h.in index dfce7e56137..e43918ced9d 100644 --- a/slurm/slurm.h.in +++ b/slurm/slurm.h.in @@ -144,7 +144,6 @@ BEGIN_C_DECLS enum job_states { JOB_PENDING, /* queued waiting for initiation */ JOB_RUNNING, /* allocated resources and executing */ - JOB_DEALLOCATING, /* Cleaning up allocation of job */ JOB_SUSPENDED, /* allocated resources, execution suspended */ JOB_COMPLETE, /* completed execution successfully */ JOB_CANCELLED, /* cancelled by user */ diff --git a/src/api/step_client_io.c b/src/api/step_client_io.c index 92d26d3cff4..7dfb750b860 100644 --- a/src/api/step_client_io.c +++ b/src/api/step_client_io.c @@ -1,8 +1,8 @@ /****************************************************************************\ - * io.c - process stdin, stdout, and stderr for parallel jobs.
+ * step_client_io.c - process stdin, stdout, and stderr for parallel jobs. * $Id$ ***************************************************************************** - * Copyright (C) 2002 The Regents of the University of California. + * Copyright (C) 2002-2006 The Regents of the University of California. * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). * Written by Mark Grondona <grondona@llnl.gov>, et. al. * UCRL-CODE-217948. @@ -170,6 +170,7 @@ struct file_read_info { uint32_t nodeid; bool eof; + bool was_blocking; }; @@ -695,6 +696,12 @@ create_file_read_eio_obj(int fd, uint32_t taskid, uint32_t nodeid, info->header.ltaskid = (uint16_t)-1; info->eof = false; + if (fd_is_blocking(fd)) { + fd_set_nonblocking(fd); + info->was_blocking = true; + } else { + info->was_blocking = false; + } eio = eio_obj_create(fd, &file_read_ops, (void *)info); return eio; @@ -717,6 +724,11 @@ static bool _file_readable(eio_obj_t *obj) } if (obj->shutdown == true) { debug3(" false, shutdown"); + /* if the file descriptor was in blocking mode before we set it + * to O_NONBLOCK, then set it back to blocking mode before + * closing */ + if (info->was_blocking) + fd_set_blocking(obj->fd); close(obj->fd); obj->fd = -1; info->eof = true; @@ -1015,7 +1027,6 @@ _init_stdio_eio_objs(client_io_fds_t fds, client_io_t *cio) * build stdin eio_obj_t */ if (fds.in.fd > -1) { - fd_set_nonblocking(fds.in.fd); fd_set_close_on_exec(fds.in.fd); cio->stdin_obj = create_file_read_eio_obj( fds.in.fd, fds.in.taskid, fds.in.nodeid, cio); diff --git a/src/common/bitstring.h b/src/common/bitstring.h index ca23991f3ff..432543287ac 100644 --- a/src/common/bitstring.h +++ b/src/common/bitstring.h @@ -36,6 +36,10 @@ * 32 bit words. * * bitstrings are zero origin + * + * bitstrings are always stored in a little-endian fashion. In other words, + * bit "1" is always in the byte of a word at the lowest memory address, + * regardless of the native architecture endianness. */ #ifndef _BITSTRING_H_ @@ -90,7 +94,11 @@ typedef bitstr_t bitoff_t; ((char *)((name) + BITSTR_OVERHEAD) + ((bit) >> BITSTR_SHIFT_WORD8)) /* mask for the bit within its word */ -#define _bit_mask(bit) ((bitstr_t)1 << ((bit)&BITSTR_MAXPOS)) +#ifdef SLURM_BIGENDIAN +#define _bit_mask(bit) ((bitstr_t)1 << (BITSTR_MAXPOS - ((bit)&BITSTR_MAXPOS))) +#else +#define _bit_mask(bit) ((bitstr_t)1 << ((bit)&BITSTR_MAXPOS)) +#endif /* number of bits actually allocated to a bitstr */ #define _bitstr_bits(name) ((name)[1]) diff --git a/src/common/fd.c b/src/common/fd.c index 1ed8201ef09..89a282b3eaa 100644 --- a/src/common/fd.c +++ b/src/common/fd.c @@ -61,6 +61,17 @@ void fd_set_noclose_on_exec(int fd) return; } +int fd_is_blocking(int fd) +{ + int val = 0; + + assert(fd >= 0); + + if ((val = fcntl(fd, F_GETFL, 0)) < 0) + error("fcntl(F_GETFL) failed: %m"); + return (val & O_NONBLOCK) ?
0 : 1; +} + void fd_set_nonblocking(int fd) { int fval; diff --git a/src/common/slurm_protocol_defs.c b/src/common/slurm_protocol_defs.c index 41471d31916..2ceeef93358 100644 --- a/src/common/slurm_protocol_defs.c +++ b/src/common/slurm_protocol_defs.c @@ -520,8 +520,6 @@ char *job_state_string(enum job_states inx) return "PENDING"; case JOB_RUNNING: return "RUNNING"; - case JOB_DEALLOCATING: - return "DEALLOCATING"; case JOB_SUSPENDED: return "SUSPENDED"; case JOB_COMPLETE: @@ -549,8 +547,6 @@ char *job_state_string_compact(enum job_states inx) return "PD"; case JOB_RUNNING: return "R"; - case JOB_DEALLOCATING: - return "DE"; case JOB_SUSPENDED: return "S"; case JOB_COMPLETE: diff --git a/src/common/slurm_protocol_defs.h b/src/common/slurm_protocol_defs.h index 051b79265c9..cf10651182a 100644 --- a/src/common/slurm_protocol_defs.h +++ b/src/common/slurm_protocol_defs.h @@ -313,7 +313,6 @@ typedef struct complete_job_allocation { typedef struct complete_batch_script { uint32_t job_id; - uint32_t step_id; uint32_t job_rc; uint32_t slurm_rc; char *node_name; diff --git a/src/common/slurm_protocol_pack.c b/src/common/slurm_protocol_pack.c index 70b86d22ec2..54e9f3b89d3 100644 --- a/src/common/slurm_protocol_pack.c +++ b/src/common/slurm_protocol_pack.c @@ -2834,7 +2834,6 @@ _pack_complete_batch_script_msg( complete_batch_script_msg_t * msg, Buf buffer) { pack32((uint32_t)msg->job_id, buffer); - pack32((uint32_t)msg->step_id, buffer); pack32((uint32_t)msg->job_rc, buffer); pack32((uint32_t)msg->slurm_rc, buffer); packstr(msg->node_name, buffer); @@ -2851,7 +2850,6 @@ _unpack_complete_batch_script_msg( *msg_ptr = msg; safe_unpack32(&msg->job_id, buffer); - safe_unpack32(&msg->step_id, buffer); safe_unpack32(&msg->job_rc, buffer); safe_unpack32(&msg->slurm_rc, buffer); safe_unpackstr_xmalloc(&msg->node_name, &uint16_tmp, buffer); diff --git a/src/common/switch.c b/src/common/switch.c index b81ebc5a8b8..935e3e6a5c9 100644 --- a/src/common/switch.c +++ b/src/common/switch.c @@ -94,6 +94,9 @@ typedef struct slurm_switch_ops { char *buf, size_t size ); int (*step_complete) ( switch_jobinfo_t jobinfo, char *nodelist ); + int (*step_part_comp) ( switch_jobinfo_t jobinfo, + char *nodelist ); + bool (*part_comp) ( void ); int (*step_allocated) ( switch_jobinfo_t jobinfo, char *nodelist ); int (*state_clear) ( void ); @@ -202,6 +205,8 @@ _slurm_switch_get_ops( slurm_switch_context_t c ) "switch_p_free_node_info", "switch_p_sprintf_node_info", "switch_p_job_step_complete", + "switch_p_job_step_part_comp", + "switch_p_part_comp", "switch_p_job_step_allocated", "switch_p_libstate_clear", "switch_p_slurmctld_init", @@ -549,6 +554,24 @@ extern int switch_g_job_step_complete(switch_jobinfo_t jobinfo, return (*(g_context->ops.step_complete))( jobinfo, nodelist ); } +extern int switch_g_job_step_part_comp(switch_jobinfo_t jobinfo, + char *nodelist) +{ + if ( switch_init() < 0 ) + return SLURM_ERROR; + + return (*(g_context->ops.step_part_comp))( jobinfo, nodelist ); +} + +extern bool switch_g_part_comp(void) +{ + if ( switch_init() < 0 ) + return false; + + return (*(g_context->ops.part_comp))( ); +} + + extern int switch_g_job_step_allocated(switch_jobinfo_t jobinfo, char *nodelist) { diff --git a/src/common/switch.h b/src/common/switch.h index b1f32167012..27747567fa3 100644 --- a/src/common/switch.h +++ b/src/common/switch.h @@ -157,12 +157,30 @@ extern int switch_g_get_jobinfo(switch_jobinfo_t jobinfo, int data_type, void *data); /* - * Note that the job step associated with the specified node + * Note that 
the job step associated with the specified nodelist * has completed execution. */ extern int switch_g_job_step_complete(switch_jobinfo_t jobinfo, char *nodelist); +/* + * Note that the job step has completed execution on the specified + * nodelist. The job step is not necessarily completed on all + * nodes, but switch resources associated with it on the specified + * nodes are no longer in use. + */ +extern int switch_g_job_step_part_comp(switch_jobinfo_t jobinfo, + char *nodelist); + +/* + * Return TRUE if the switch plugin processes partial job step + * completion calls (i.e. switch_g_job_step_part_comp). Support + * of partial completions is compute intensive, so it should + * be avoided unless switch resources are in short supply (e.g. + * switch/federation). Otherwise return FALSE. + */ +extern bool switch_g_part_comp(void); + /* * Restore the switch allocation information "jobinfo" for an already * allocated job step, most likely to restore the switch information diff --git a/src/plugins/switch/elan/switch_elan.c b/src/plugins/switch/elan/switch_elan.c index 1d96ac15d08..41cf0e0936b 100644 --- a/src/plugins/switch/elan/switch_elan.c +++ b/src/plugins/switch/elan/switch_elan.c @@ -2,7 +2,7 @@ * switch_elan.c - Library routines for initiating jobs on QsNet. * $Id$ ***************************************************************************** - * Copyright (C) 2003 The Regents of the University of California. + * Copyright (C) 2003-2006 The Regents of the University of California. * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). * Written by Kevin Tew <tew1@llnl.gov>, et. al. * UCRL-CODE-217948. @@ -804,6 +804,17 @@ extern int switch_p_job_step_complete(switch_jobinfo_t jobinfo, return SLURM_SUCCESS; } +extern int switch_p_job_step_part_comp(switch_jobinfo_t jobinfo, + char *nodelist) +{ + return SLURM_SUCCESS; +} + +extern bool switch_p_part_comp(void) +{ + return false; +} + extern int switch_p_job_step_allocated(switch_jobinfo_t jobinfo, char *nodelist) { return qsw_restore_jobinfo((qsw_jobinfo_t) jobinfo); } diff --git a/src/plugins/switch/federation/federation.c b/src/plugins/switch/federation/federation.c index 8be66221d6e..c2916d73c0d 100644 --- a/src/plugins/switch/federation/federation.c +++ b/src/plugins/switch/federation/federation.c @@ -82,6 +82,7 @@ mode_t fed_umask; typedef struct fed_window { uint16_t id; uint32_t status; + uint16_t job_key; } fed_window_t; typedef struct fed_adapter { @@ -126,6 +127,9 @@ struct fed_jobinfo { uint8_t bulk_xfer; /* flag */ uint16_t tables_per_task; fed_tableinfo_t *tableinfo; + + hostlist_t nodenames; + int num_tasks; }; typedef struct { @@ -884,6 +888,7 @@ fed_pack_nodeinfo(fed_nodeinfo_t *n, Buf buf) for(j = 0; j < a->window_count; j++) { pack16(a->window_list[j].id, buf); pack32(a->window_list[j].status, buf); + pack16(a->window_list[j].job_key, buf); } } @@ -953,8 +958,7 @@ _hash_index (char *name) return index; } -/* Tries to find a node fast using the hash table if possible, - * otherwise falls back to a linear search.
+/* Tries to find a node fast using the hash table * * Used by: slurmctld */ @@ -1208,8 +1212,11 @@ _unpack_nodeinfo(fed_nodeinfo_t *n, Buf buf, bool believe_window_status) for(j = 0; j < tmp_a->window_count; j++) { safe_unpack16(&tmp_w[j].id, buf); safe_unpack32(&tmp_w[j].status, buf); - if (!believe_window_status) + safe_unpack16(&tmp_w[j].job_key, buf); + if (!believe_window_status) { tmp_w[j].status = NTBL_UNLOADED_STATE; + tmp_w[j].job_key = 0; + } } tmp_a->window_list = tmp_w; } @@ -1340,7 +1347,7 @@ _find_window(fed_adapter_t *adapter, int window_id) { */ static int _allocate_windows_all(int adapter_cnt, fed_tableinfo_t *tableinfo, - char *hostname, int task_id) + char *hostname, int task_id, uint16_t job_key) { fed_nodeinfo_t *node; fed_adapter_t *adapter; @@ -1367,6 +1374,7 @@ _allocate_windows_all(int adapter_cnt, fed_tableinfo_t *tableinfo, return SLURM_ERROR; } window->status = NTBL_LOADED_STATE; + window->job_key = job_key; table = tableinfo[i].table[task_id]; table->task_id = task_id; @@ -1390,7 +1398,7 @@ _allocate_windows_all(int adapter_cnt, fed_tableinfo_t *tableinfo, */ static int _allocate_window_single(char *adapter_name, fed_tableinfo_t *tableinfo, - char *hostname, int task_id) + char *hostname, int task_id, uint16_t job_key) { fed_nodeinfo_t *node; fed_adapter_t *adapter = NULL; @@ -1432,6 +1440,7 @@ _allocate_window_single(char *adapter_name, fed_tableinfo_t *tableinfo, return SLURM_ERROR; } window->status = NTBL_LOADED_STATE; + window->job_key = job_key; table = tableinfo[0].table[task_id]; table->task_id = task_id; @@ -1452,7 +1461,8 @@ _allocate_window_single(char *adapter_name, fed_tableinfo_t *tableinfo, */ static int _window_state_set(int adapter_cnt, fed_tableinfo_t *tableinfo, - char *hostname, int task_id, enum NTBL_RC state) + char *hostname, int task_id, enum NTBL_RC state, + uint16_t job_key) { fed_nodeinfo_t *node = NULL; fed_adapter_t *adapter = NULL; @@ -1511,8 +1521,11 @@ _window_state_set(int adapter_cnt, fed_tableinfo_t *tableinfo, adapter->name, table->lid, table->window_id, task_id); window = _find_window(adapter, table->window_id); - if (window) + if (window) { window->status = state; + window->job_key = + (state == NTBL_UNLOADED_STATE) ? 0 : job_key; + } } return SLURM_SUCCESS; @@ -1617,7 +1630,7 @@ _job_step_window_state(fed_jobinfo_t *jp, hostlist_t hl, enum NTBL_RC state) rc = _window_state_set(jp->tables_per_task, jp->tableinfo, host, proc_cnt, - state); + state, jp->job_key); proc_cnt++; } free(host); @@ -1628,15 +1641,101 @@ _job_step_window_state(fed_jobinfo_t *jp, hostlist_t hl, enum NTBL_RC state) return SLURM_SUCCESS; } -/* Find all of the windows used by job step "jp" and mark their - * state NTBL_UNLOADED_STATE. +/* + * For one node, free all of the windows belonging to a particular + * job step (as identified by the job_key). 
+ */ +static void inline +_free_windows_by_job_key(uint16_t job_key, char *nodename) +{ + fed_nodeinfo_t *node; + fed_adapter_t *adapter; + fed_window_t *window; + int i, j; + + /* debug3("_free_windows_by_job_key(%hu, %s)", job_key, nodename); */ + if ((node = _find_node(fed_state, nodename)) == NULL) + return; + + if (node->adapter_list == NULL) { + error("_free_windows_by_job_key, " + "adapter_list NULL for node %s", nodename); + return; + } + for (i = 0; i < node->adapter_count; i++) { + adapter = &node->adapter_list[i]; + if (adapter->window_list == NULL) { + error("_free_windows_by_job_key, " + "window_list NULL for node %s adapter %s", + nodename, adapter->name); + continue; + } + /* We could check here to see if this adapter's name + * is in the fed_jobinfo tableinfo list to avoid the next + * loop if the adapter isn't in use by the job step. + * However, the added searching and string comparisons + * probably aren't worth it, especially since MOST job + * steps will use all of the adapters. + */ + for (j = 0; j < adapter->window_count; j++) { + window = &adapter->window_list[j]; + + if (window->job_key == job_key) { + /* debug3("Freeing adapter %s window %d", + adapter->name, window->id); */ + window->status = NTBL_UNLOADED_STATE; + window->job_key = 0; + } + } + } +} + +/* Find all of the windows used by job step "jp" on the hosts + * designated in hostlist "hl" and mark their state NTBL_UNLOADED_STATE. * * Used by: slurmctld */ int fed_job_step_complete(fed_jobinfo_t *jp, hostlist_t hl) { - return _job_step_window_state(jp, hl, NTBL_UNLOADED_STATE); + enum NTBL_RC state = NTBL_UNLOADED_STATE; + hostlist_t uniq_hl; + hostlist_iterator_t hi; + char *nodename; + + xassert(!hostlist_is_empty(hl)); + xassert(jp); + xassert(jp->magic == FED_JOBINFO_MAGIC); + + if ((jp == NULL) + || (jp->magic != FED_JOBINFO_MAGIC) + || (hostlist_is_empty(hl))) + return SLURM_ERROR; + + if ((jp->tables_per_task == 0) + || !jp->tableinfo + || (jp->tableinfo[0].table_length == 0)) + return SLURM_SUCCESS; + + /* The hl hostlist may contain duplicate nodenames (poe -hostfile + * triggers duplicates in the hostlist). Since there + * is no reason to call _free_windows_by_job_key more than once + * per nodename, we create a new unique hostlist. + */ + uniq_hl = hostlist_copy(hl); + hostlist_uniq(uniq_hl); + hi = hostlist_iterator_create(uniq_hl); + + _lock(); + while((nodename = hostlist_next(hi)) != NULL) { + _free_windows_by_job_key(jp->job_key, nodename); + free(nodename); + } + _unlock(); + + hostlist_iterator_destroy(hi); + hostlist_destroy(uniq_hl); + return SLURM_SUCCESS; } @@ -1645,7 +1744,7 @@ fed_job_step_complete(fed_jobinfo_t *jp, hostlist_t hl) * * Used by the slurmctld at startup time to restore the allocation * status of any job steps that were running at the time the previous - * slurmctld was shutdown. Also used to restore teh allocation + * slurmctld was shutdown. Also used to restore the allocation * status after a call to switch_clear().
*/ int @@ -1746,11 +1845,13 @@ fed_build_jobinfo(fed_jobinfo_t *jp, hostlist_t hl, int nprocs, if (adapter_name == NULL) { rc = _allocate_windows_all(jp->tables_per_task, jp->tableinfo, - host, proc_cnt); + host, proc_cnt, + jp->job_key); } else { rc = _allocate_window_single(adapter_name, jp->tableinfo, - host, proc_cnt); + host, proc_cnt, + jp->job_key); } if (rc != SLURM_SUCCESS) { _unlock(); diff --git a/src/plugins/switch/federation/switch_federation.c b/src/plugins/switch/federation/switch_federation.c index e07bf130512..897df04c179 100644 --- a/src/plugins/switch/federation/switch_federation.c +++ b/src/plugins/switch/federation/switch_federation.c @@ -3,7 +3,7 @@ ** Federation ** $Id$ ***************************************************************************** - * Copyright (C) 2004 The Regents of the University of California. + * Copyright (C) 2004-2006 The Regents of the University of California. * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). * Written by Jason King <jking@llnl.gov> * UCRL-CODE-217948. @@ -454,7 +454,7 @@ extern int switch_p_get_jobinfo(switch_jobinfo_t switch_job, int key, return fed_get_jobinfo((fed_jobinfo_t *)switch_job, key, resulting_data); } -int switch_p_job_step_complete(switch_jobinfo_t jobinfo, char *nodelist) +static inline int _make_step_comp(switch_jobinfo_t jobinfo, char *nodelist) { hostlist_t list = NULL; int rc; @@ -466,7 +466,22 @@ int switch_p_job_step_complete(switch_jobinfo_t jobinfo, char *nodelist) return rc; } -int switch_p_job_step_allocated(switch_jobinfo_t jobinfo, char *nodelist) +extern int switch_p_job_step_complete(switch_jobinfo_t jobinfo, char *nodelist) +{ + return _make_step_comp(jobinfo, nodelist); +} + +extern int switch_p_job_step_part_comp(switch_jobinfo_t jobinfo, char *nodelist) +{ + return _make_step_comp(jobinfo, nodelist); +} + +extern bool switch_p_part_comp(void) +{ + return true; +} + +extern int switch_p_job_step_allocated(switch_jobinfo_t jobinfo, char *nodelist) { hostlist_t list = NULL; int rc; diff --git a/src/plugins/switch/none/switch_none.c b/src/plugins/switch/none/switch_none.c index a58f4e22e44..bc15b22e212 100644 --- a/src/plugins/switch/none/switch_none.c +++ b/src/plugins/switch/none/switch_none.c @@ -1,7 +1,7 @@ /*****************************************************************************\ * switch_none.c - Library for managing a switch with no special handling. ***************************************************************************** - * Copyright (C) 2002 The Regents of the University of California. + * Copyright (C) 2002-2006 The Regents of the University of California. * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). * Written by Morris Jette <jette1@llnl.gov> * UCRL-CODE-217948.
@@ -278,6 +278,17 @@ extern int switch_p_job_step_complete(switch_jobinfo_t jobinfo, return SLURM_SUCCESS; } +extern int switch_p_job_step_part_comp(switch_jobinfo_t jobinfo, + char *nodelist) +{ + return SLURM_SUCCESS; +} + +extern bool switch_p_part_comp(void) +{ + return false; +} + extern int switch_p_job_step_allocated(switch_jobinfo_t jobinfo, char *nodelist) { diff --git a/src/slurmctld/job_mgr.c b/src/slurmctld/job_mgr.c index e92ca1b4acd..f390eee23ba 100644 --- a/src/slurmctld/job_mgr.c +++ b/src/slurmctld/job_mgr.c @@ -129,7 +129,6 @@ static int _resume_job_nodes(struct job_record *job_ptr); static void _set_job_id(struct job_record *job_ptr); static void _set_job_prio(struct job_record *job_ptr); static void _signal_batch_job(struct job_record *job_ptr, uint16_t signal); -static void _kill_signal_job(struct job_record *job_ptr); static void _signal_job(struct job_record *job_ptr, int signal); static void _suspend_job(struct job_record *job_ptr, uint16_t op); static int _suspend_job_nodes(struct job_record *job_ptr); @@ -174,14 +173,12 @@ struct job_record *create_job_record(int *error_code) xassert (job_ptr->magic = JOB_MAGIC); /* sets value */ job_ptr->details = detail_ptr; job_ptr->step_list = list_create(NULL); - job_ptr->suspended = false; - if (job_ptr->step_list == NULL) fatal("memory allocation failure"); xassert (detail_ptr->magic = DETAILS_MAGIC); /* set value */ detail_ptr->submit_time = time(NULL); - + if (list_append(job_list, job_ptr) == 0) fatal("list_append memory allocation failure"); @@ -1031,22 +1028,24 @@ extern int kill_job_by_part_name(char *part_name) job_iterator = list_iterator_create(job_list); while ((job_ptr = (struct job_record *) list_next(job_iterator))) { + bool suspended = false; if (job_ptr->part_ptr != part_ptr) continue; job_ptr->part_ptr = NULL; - if ((job_ptr->job_state == JOB_RUNNING) - || job_ptr->suspended) { + if (job_ptr->job_state == JOB_SUSPENDED) + suspended = true; + if ((job_ptr->job_state == JOB_RUNNING) || suspended) { job_count++; info("Killing job_id %u on defunct partition %s", job_ptr->job_id, part_name); job_ptr->job_state = JOB_NODE_FAIL | JOB_COMPLETING; - if (job_ptr->suspended) + if (suspended) job_ptr->end_time = job_ptr->suspend_time; else job_ptr->end_time = time(NULL); job_completion_logger(job_ptr); - deallocate_nodes(job_ptr, false, job_ptr->suspended); + deallocate_nodes(job_ptr, false, suspended); } } @@ -1079,11 +1078,12 @@ extern int kill_running_job_by_node_name(char *node_name, bool step_test) job_iterator = list_iterator_create(job_list); while ((job_ptr = (struct job_record *) list_next(job_iterator))) { + bool suspended = false; if ((job_ptr->node_bitmap == NULL) || (!bit_test(job_ptr->node_bitmap, bit_position))) continue; /* job not on this node */ if (job_ptr->job_state == JOB_SUSPENDED) - job_ptr->suspended = true; + suspended = true; if (job_ptr->job_state & JOB_COMPLETING) { job_count++; bit_clear(job_ptr->node_bitmap, bit_position); @@ -1100,8 +1100,7 @@ extern int kill_running_job_by_node_name(char *node_name, bool step_test) error("Node %s comp_job_cnt underflow, " "JobId=%u", node_ptr->name, job_ptr->job_id); - } else if ((job_ptr->job_state == JOB_RUNNING) - || job_ptr->suspended) { + } else if ((job_ptr->job_state == JOB_RUNNING) || suspended) { if (step_test && (step_on_node(job_ptr, node_ptr) == 0)) continue; @@ -1115,14 +1114,12 @@ extern int kill_running_job_by_node_name(char *node_name, bool step_test) job_ptr->job_id, node_name); job_ptr->job_state = JOB_NODE_FAIL | JOB_COMPLETING; - if 
(job_ptr->suspended) - job_ptr->end_time = - job_ptr->suspend_time; + if (suspended) + job_ptr->end_time = job_ptr->suspend_time; else job_ptr->end_time = time(NULL); job_completion_logger(job_ptr); - deallocate_nodes(job_ptr, false, - job_ptr->suspended); + deallocate_nodes(job_ptr, false, suspended); } else { error("Removing failed node %s from job_id %u", node_name, job_ptr->job_id); @@ -1441,7 +1438,8 @@ extern int job_fail(uint32_t job_id) { struct job_record *job_ptr; time_t now = time(NULL); - + bool suspended = false; + job_ptr = find_job_record(job_id); if (job_ptr == NULL) { error("job_fail: invalid job id %u", job_id); @@ -1450,16 +1448,18 @@ extern int job_fail(uint32_t job_id) if (IS_JOB_FINISHED(job_ptr)) return ESLURM_ALREADY_DONE; - if ((job_ptr->job_state == JOB_RUNNING) || job_ptr->suspended) { + if (job_ptr->job_state == JOB_SUSPENDED) + suspended = true; + if ((job_ptr->job_state == JOB_RUNNING) || suspended) { /* No need to signal steps, deallocate kills them */ job_ptr->time_last_active = now; - if (job_ptr->suspended) - job_ptr->end_time = job_ptr->suspend_time; + if (suspended) + job_ptr->end_time = job_ptr->suspend_time; else - job_ptr->end_time = now; + job_ptr->end_time = now; last_job_update = now; job_ptr->job_state = JOB_FAILED | JOB_COMPLETING; - deallocate_nodes(job_ptr, false, job_ptr->suspended); + deallocate_nodes(job_ptr, false, suspended); job_completion_logger(job_ptr); return SLURM_SUCCESS; } @@ -1481,7 +1481,7 @@ extern int job_fail(uint32_t job_id) * last_job_update - time of last job table update */ extern int job_signal(uint32_t job_id, uint16_t signal, uint16_t batch_flag, - uid_t uid) + uid_t uid) { struct job_record *job_ptr; time_t now = time(NULL); @@ -1511,14 +1511,23 @@ extern int job_signal(uint32_t job_id, uint16_t signal, uint16_t batch_flag, if ((job_ptr->job_state == JOB_PENDING) && (signal == SIGKILL)) { - job_complete(job_id, uid, false, NO_VAL); + last_job_update = now; + job_ptr->job_state = JOB_CANCELLED; + job_ptr->start_time = now; + job_ptr->end_time = now; + job_completion_logger(job_ptr); + delete_job_details(job_ptr); verbose("job_signal of pending job %u successful", job_id); return SLURM_SUCCESS; } if ((job_ptr->job_state == JOB_SUSPENDED) && (signal == SIGKILL)) { - job_complete(job_id, uid, false, NO_VAL); + last_job_update = now; + job_ptr->end_time = job_ptr->suspend_time; + job_ptr->job_state = JOB_CANCELLED | JOB_COMPLETING; + deallocate_nodes(job_ptr, false, true); + job_completion_logger(job_ptr); verbose("job_signal %u of suspended job %u successful", signal, job_id); return SLURM_SUCCESS; @@ -1526,8 +1535,13 @@ extern int job_signal(uint32_t job_id, uint16_t signal, uint16_t batch_flag, if (job_ptr->job_state == JOB_RUNNING) { if (signal == SIGKILL) { - job_ptr->time_last_active = now; - job_complete(job_id, uid, false, NO_VAL); + /* No need to signal steps, deallocate kills them */ + job_ptr->time_last_active = now; + job_ptr->end_time = now; + last_job_update = now; + job_ptr->job_state = JOB_CANCELLED | JOB_COMPLETING; + deallocate_nodes(job_ptr, false, false); + job_completion_logger(job_ptr); } else if (batch_flag) { if (job_ptr->batch_flag) _signal_batch_job(job_ptr, signal); @@ -1593,16 +1607,19 @@ _signal_batch_job(struct job_record *job_ptr, uint16_t signal) * last_job_update - time of last job table update */ extern int job_complete(uint32_t job_id, uid_t uid, bool requeue, - uint32_t job_return_code){ + uint32_t job_return_code) +{ struct job_record *job_ptr; time_t now = time(NULL); uint32_t 
job_comp_flag = 0; + bool suspended = false; + job_ptr = find_job_record(job_id); if (job_ptr == NULL) { info("job_complete: invalid JobId=%u", job_id); return ESLURM_INVALID_JOB_ID; } - + if (IS_JOB_FINISHED(job_ptr)) return ESLURM_ALREADY_DONE; @@ -1614,18 +1631,13 @@ extern int job_complete(uint32_t job_id, uid_t uid, bool requeue, if (job_ptr->job_state & JOB_COMPLETING) return SLURM_SUCCESS; /* avoid replay */ - last_job_update = now; - - /* make sure all the steps know they are suppost to be done */ - if (job_ptr->job_state == JOB_RUNNING - || job_ptr->job_state == JOB_SUSPENDED) - _kill_signal_job(job_ptr); - - if (job_ptr->job_state == JOB_RUNNING - || job_ptr->job_state == JOB_SUSPENDED - || job_ptr->job_state == JOB_DEALLOCATING) + if (job_ptr->job_state == JOB_RUNNING) job_comp_flag = JOB_COMPLETING; - + if (job_ptr->job_state == JOB_SUSPENDED) { + job_comp_flag = JOB_COMPLETING; + suspended = true; + } + if (requeue && (job_ptr->batch_flag > 1)) { /* Failed one requeue, just kill it */ requeue = 0; @@ -1644,7 +1656,6 @@ extern int job_complete(uint32_t job_id, uid_t uid, bool requeue, job_ptr->end_time = now; job_completion_logger(job_ptr); } else { - job_ptr->kill_on_step_done = 1; if (job_return_code == NO_VAL) job_ptr->job_state = JOB_CANCELLED| job_comp_flag; else if (job_return_code) @@ -1652,39 +1663,19 @@ extern int job_complete(uint32_t job_id, uid_t uid, bool requeue, else if (job_comp_flag && /* job was running */ (job_ptr->end_time < now)) /* over time limit */ job_ptr->job_state = JOB_TIMEOUT | job_comp_flag; - else - job_ptr->job_state = JOB_DEALLOCATING; + else + job_ptr->job_state = JOB_COMPLETE | job_comp_flag; + if (suspended) + job_ptr->end_time = job_ptr->suspend_time; + else + job_ptr->end_time = now; + job_completion_logger(job_ptr); } - /* job was running */ - if (job_comp_flag) { - if (list_is_empty(job_ptr->step_list)) { - if (job_ptr->job_state == JOB_DEALLOCATING) - job_ptr->job_state = - JOB_COMPLETE | job_comp_flag; - if (job_ptr->suspended) - job_ptr->end_time = job_ptr->suspend_time; - else - job_ptr->end_time = now; - job_completion_logger(job_ptr); - deallocate_nodes(job_ptr, false, job_ptr->suspended); - info("job_complete for JobId=%u successful", job_id); - } else if (job_ptr->job_state != JOB_DEALLOCATING) { - if (job_ptr->suspended) - job_ptr->end_time = job_ptr->suspend_time; - else - job_ptr->end_time = now; - job_completion_logger(job_ptr); - deallocate_nodes(job_ptr, false, job_ptr->suspended); - info("1 job_complete for JobId=%u successful", job_id); - } else { - debug("%d job steps not complete", - list_count(job_ptr->step_list)); - } - } else { - info("job_complete for non-running JobId=%u successful", - job_id); - } + last_job_update = now; + if (job_comp_flag) /* job was running */ + deallocate_nodes(job_ptr, false, suspended); + info("job_complete for JobId=%u successful", job_id); return SLURM_SUCCESS; } @@ -3827,6 +3818,7 @@ extern bool job_epilog_complete(uint32_t job_id, char *node_name, } #endif + step_epilog_complete(job_ptr, node_name); if (!(job_ptr->job_state & JOB_COMPLETING)) { /* COMPLETED */ if ((job_ptr->job_state == JOB_PENDING) && (job_ptr->batch_flag)) { @@ -3943,14 +3935,13 @@ static void _signal_job(struct job_record *job_ptr, int signal) signal_job_msg_t *signal_job_msg = NULL; int i, buf_rec_size = 0; - debug3("signaling job %d with signal %d", job_ptr->job_id, signal); agent_args = xmalloc(sizeof(agent_arg_t)); agent_args->msg_type = REQUEST_SIGNAL_JOB; agent_args->retry = 1; - signal_job_msg = 
xmalloc(sizeof(signal_job_msg_t)); + signal_job_msg = xmalloc(sizeof(kill_tasks_msg_t)); signal_job_msg->job_id = job_ptr->job_id; signal_job_msg->signal = signal; - + for (i = 0; i < node_record_count; i++) { if (bit_test(job_ptr->node_bitmap, i) == 0) continue; @@ -3978,64 +3969,12 @@ static void _signal_job(struct job_record *job_ptr, int signal) xfree(agent_args); return; } - agent_args->msg_args = signal_job_msg; - agent_queue_request(agent_args); - return; -} - -/* Send specified SIGTERM to all steps associated with a job */ -static void _kill_signal_job(struct job_record *job_ptr) -{ - agent_arg_t *agent_args; - kill_job_msg_t *kill_job_msg; - int i, buf_rec_size = 0; - - debug3("sending SIGTERM to job %d", job_ptr->job_id); - agent_args = xmalloc(sizeof(agent_arg_t)); - - agent_args->msg_type = REQUEST_TERMINATE_JOB; - agent_args->retry = 1; - - kill_job_msg = xmalloc(sizeof(kill_job_msg_t)); - kill_job_msg->job_id = job_ptr->job_id; - kill_job_msg->job_uid = job_ptr->user_id; - kill_job_msg->nodes = xstrdup(job_ptr->nodes); - kill_job_msg->select_jobinfo = select_g_copy_jobinfo( - job_ptr->select_jobinfo); - - for (i = 0; i < node_record_count; i++) { - if (bit_test(job_ptr->node_bitmap, i) == 0) - continue; - if ((agent_args->node_count + 1) > buf_rec_size) { - buf_rec_size += 128; - xrealloc((agent_args->slurm_addr), - (sizeof(struct sockaddr_in) * - buf_rec_size)); - xrealloc((agent_args->node_names), - (MAX_SLURM_NAME * buf_rec_size)); - } - agent_args->slurm_addr[agent_args->node_count] = - node_record_table_ptr[i].slurm_addr; - strncpy(&agent_args-> - node_names[MAX_SLURM_NAME * agent_args->node_count], - node_record_table_ptr[i].name, MAX_SLURM_NAME); - agent_args->node_count++; -#ifdef HAVE_FRONT_END /* Operate only on front-end */ - break; -#endif - } - if (agent_args->node_count == 0) { - slurm_free_kill_job_msg(kill_job_msg); - xfree(agent_args); - return; - } - agent_args->msg_args = kill_job_msg; + agent_args->msg_args = signal_job_msg; agent_queue_request(agent_args); return; } - /* Send suspend request to slumrd of all nodes associated with a job */ static void _suspend_job(struct job_record *job_ptr, uint16_t op) { @@ -4099,7 +4038,7 @@ static int _suspend_job_nodes(struct job_record *job_ptr) if (node_ptr->run_job_cnt) (node_ptr->run_job_cnt)--; else { - error("1 Node %s run_job_cnt underflow", + error("Node %s run_job_cnt underflow", node_ptr->name); } if (job_ptr->details @@ -4107,7 +4046,7 @@ static int _suspend_job_nodes(struct job_record *job_ptr) if (node_ptr->no_share_job_cnt) (node_ptr->no_share_job_cnt)--; else { - error("1 Node %s no_share_job_cnt " + error("Node %s no_share_job_cnt " "underflow", node_ptr->name); } if (node_ptr->no_share_job_cnt == 0) @@ -4235,7 +4174,6 @@ extern int job_suspend(suspend_msg_t *sus_ptr, uid_t uid, goto reply; _suspend_job(job_ptr, sus_ptr->op); job_ptr->job_state = JOB_SUSPENDED; - job_ptr->suspended = true; if (job_ptr->suspend_time) { job_ptr->pre_sus_time += difftime(now, @@ -4255,7 +4193,6 @@ extern int job_suspend(suspend_msg_t *sus_ptr, uid_t uid, goto reply; _suspend_job(job_ptr, sus_ptr->op); job_ptr->job_state = JOB_RUNNING; - job_ptr->suspended = false; if (job_ptr->time_limit != INFINITE) { /* adjust effective time_limit */ job_ptr->end_time = now + diff --git a/src/slurmctld/node_mgr.c b/src/slurmctld/node_mgr.c index 1656b6d852e..250346b0b13 100644 --- a/src/slurmctld/node_mgr.c +++ b/src/slurmctld/node_mgr.c @@ -1708,14 +1708,15 @@ extern void make_node_comp(struct node_record *node_ptr, if 
(node_ptr->run_job_cnt) (node_ptr->run_job_cnt)--; else - error("2 Node %s run_job_cnt underflow", node_ptr->name); + error("Node %s run_job_cnt underflow in " + "make_node_comp", node_ptr->name); if (job_ptr->details && (job_ptr->details->shared == 0)) { if (node_ptr->no_share_job_cnt) (node_ptr->no_share_job_cnt)--; else - error("2 Node %s no_share_job_cnt underflow", - node_ptr->name); + error("Node %s no_share_job_cnt underflow in " + "make_node_comp", node_ptr->name); if (node_ptr->no_share_job_cnt == 0) bit_set(share_node_bitmap, inx); } @@ -1796,14 +1797,15 @@ void make_node_idle(struct node_record *node_ptr, if (node_ptr->run_job_cnt) (node_ptr->run_job_cnt)--; else - error("3 Node %s run_job_cnt underflow", - node_ptr->name); + error("Node %s run_job_cnt underflow in " + "make_node_idle", node_ptr->name); } else { if (node_ptr->comp_job_cnt) (node_ptr->comp_job_cnt)--; else - error("3 Node %s comp_job_cnt underflow, job_id %u", - node_ptr->name, job_ptr->job_id); + error("Node %s comp_job_cnt underflow in " + "make_node_idle, job_id %u", + node_ptr->name, job_ptr->job_id); if (node_ptr->comp_job_cnt > 0) return; /* More jobs completing */ } diff --git a/src/slurmctld/node_scheduler.c b/src/slurmctld/node_scheduler.c index 89dc3daf9a0..00cde811e13 100644 --- a/src/slurmctld/node_scheduler.c +++ b/src/slurmctld/node_scheduler.c @@ -149,7 +149,7 @@ extern int count_cpus(unsigned *bitmap) * node_record_table_ptr - pointer to global node table */ extern void deallocate_nodes(struct job_record *job_ptr, bool timeout, - bool suspended) + bool suspended) { int i; kill_job_msg_t *kill_job; @@ -215,6 +215,8 @@ extern void deallocate_nodes(struct job_record *job_ptr, bool timeout, if (agent_args->node_count == 0) { error("Job %u allocated no nodes to be killed on", job_ptr->job_id); + xfree(kill_job->nodes); + select_g_free_jobinfo(&kill_job->select_jobinfo); xfree(kill_job); xfree(agent_args); return; diff --git a/src/slurmctld/proc_req.c b/src/slurmctld/proc_req.c index 051aa126341..605937ac7b4 100644 --- a/src/slurmctld/proc_req.c +++ b/src/slurmctld/proc_req.c @@ -916,8 +916,8 @@ static void _slurm_rpc_complete_batch_script(slurm_msg_t * msg) /* init */ START_TIMER; - debug2("Processing RPC: REQUEST_COMPLETE_BATCH_SCRIPT %u.%u", - comp_msg->job_id, comp_msg->step_id); + debug2("Processing RPC: REQUEST_COMPLETE_BATCH_SCRIPT %u", + comp_msg->job_id); uid = g_slurm_auth_get_uid(msg->auth_cred); if (!_is_super_user(uid)) { @@ -932,19 +932,16 @@ static void _slurm_rpc_complete_batch_script(slurm_msg_t * msg) /* First set node DOWN if fatal error */ if (comp_msg->slurm_rc == ESLURM_ALREADY_DONE) { /* race condition on job termination, not a real error */ - info("slurmd error running JobId=%u.%u from node=%s: %s", - comp_msg->job_id, - comp_msg->step_id, - comp_msg->node_name, - slurm_strerror(comp_msg->slurm_rc)); + info("slurmd error running JobId=%u from node=%s: %s", + comp_msg->job_id, + comp_msg->node_name, + slurm_strerror(comp_msg->slurm_rc)); comp_msg->slurm_rc = SLURM_SUCCESS; } if (comp_msg->slurm_rc != SLURM_SUCCESS) { - error("Fatal slurmd error %u running JobId=%u.%u " - "on node=%s: %s", + error("Fatal slurmd error %u running JobId=%u on node=%s: %s", comp_msg->slurm_rc, comp_msg->job_id, - comp_msg->step_id, comp_msg->node_name, slurm_strerror(comp_msg->slurm_rc)); if (error_code == SLURM_SUCCESS) { @@ -961,10 +958,6 @@ static void _slurm_rpc_complete_batch_script(slurm_msg_t * msg) } } - /* ignore step complete will catch it on job_complete */ - 
job_step_complete(comp_msg->job_id, comp_msg->step_id, - uid, job_requeue, comp_msg->job_rc); - /* Mark job allocation complete */ error_code = job_complete(comp_msg->job_id, uid, job_requeue, comp_msg->job_rc); @@ -1732,8 +1725,8 @@ static void _slurm_rpc_submit_batch_job(slurm_msg_t * msg) lock_slurmctld(job_write_lock); error_code = job_allocate(job_desc_msg, - job_desc_msg->immediate, false, - false, uid, &job_ptr); + job_desc_msg->immediate, false, + false, uid, &job_ptr); unlock_slurmctld(job_write_lock); END_TIMER; } diff --git a/src/slurmctld/slurmctld.h b/src/slurmctld/slurmctld.h index 679779ac0c7..17728a3f862 100644 --- a/src/slurmctld/slurmctld.h +++ b/src/slurmctld/slurmctld.h @@ -342,8 +342,6 @@ struct job_record { each of the ntask_cnt hosts */ uint16_t mail_type; /* see MAIL_JOB_* in slurm.h */ char *mail_user; /* user to get e-mail notification */ - bool suspended; /* marker to tell if job was - * suspended or not */ }; struct step_record { @@ -541,7 +539,8 @@ extern struct job_record *find_job_record (uint32_t job_id); */ extern struct node_record *find_first_node_record (bitstr_t *node_bitmap); -/* find_node_record - find a record for node with specified name */ +/* find_node_record - find a record for node with specified name, + * returns pointer to record or NULL if not found */ extern struct node_record *find_node_record (char *name); /* @@ -1160,6 +1159,16 @@ extern int step_create ( job_step_create_request_msg_t *step_specs, bool kill_job_when_step_done, bool batch_step ); +/* + * step_epilog_complete - note completion of epilog on some node and + * release its switch windows if appropriate. Can perform partial + * switch window releases. + * IN job_ptr - pointer to job which has completed epilog + * IN node_name - name of node which has completed epilog + */ +extern int step_epilog_complete(struct job_record *job_ptr, + char *node_name); + /* * step_on_node - determine if the specified job has any job steps allocated to * the specified node diff --git a/src/slurmctld/step_mgr.c b/src/slurmctld/step_mgr.c index 90130b6a777..3242fde6b41 100644 --- a/src/slurmctld/step_mgr.c +++ b/src/slurmctld/step_mgr.c @@ -61,6 +61,11 @@ static void _pack_ctld_job_step_info(struct step_record *step, Buf buffer); static bitstr_t * _pick_step_nodes (struct job_record *job_ptr, job_step_create_request_msg_t *step_spec ); +static hostlist_t _step_range_to_hostlist(struct step_record *step_ptr, + uint32_t range_first, uint32_t range_last); +static int _step_hostname_to_inx(struct step_record *step_ptr, + char *node_name); + /* * create_step_record - create an empty step_record for the specified job.
* IN job_ptr - pointer to job table entry to have step record added @@ -251,8 +256,7 @@ int job_step_signal(uint32_t job_id, uint32_t step_id, if (IS_JOB_FINISHED(job_ptr)) return ESLURM_ALREADY_DONE; - if (job_ptr->job_state != JOB_RUNNING - && job_ptr->job_state != JOB_DEALLOCATING) { + if (job_ptr->job_state != JOB_RUNNING) { verbose("job_step_signal: step %u.%u can not be sent signal " "%u from state=%s", job_id, step_id, signal, job_state_string(job_ptr->job_state)); @@ -351,7 +355,6 @@ int job_step_complete(uint32_t job_id, uint32_t step_id, uid_t uid, struct job_record *job_ptr; struct step_record *step_ptr; int error_code; - int nodes; job_ptr = find_job_record(job_id); if (job_ptr == NULL) { @@ -362,42 +365,27 @@ int job_step_complete(uint32_t job_id, uint32_t step_id, uid_t uid, step_ptr = find_step_record(job_ptr, step_id); if (step_ptr == NULL) return ESLURM_INVALID_JOB_ID; + else + jobacct_g_step_complete_slurmctld(step_ptr); - if (step_ptr->exit_code == NO_VAL) { - /* initialize the node bitmap for exited nodes */ - nodes = bit_set_count(step_ptr->step_node_bitmap); - xassert(step_ptr->exit_node_bitmap == NULL); - step_ptr->exit_node_bitmap = bit_alloc(nodes); - if (step_ptr->exit_node_bitmap == NULL) - fatal("bit_alloc: %m"); - step_ptr->exit_code = job_return_code; - } + if ((job_ptr->kill_on_step_done) + && (list_count(job_ptr->step_list) <= 1) + && (!IS_JOB_FINISHED(job_ptr))) + return job_complete(job_id, uid, requeue, job_return_code); - jobacct_g_step_complete_slurmctld(step_ptr); - if ((job_ptr->user_id != uid) && (uid != 0) && (uid != getuid())) { error("Security violation, JOB_COMPLETE RPC from uid %d", uid); return ESLURM_USER_ID_MISSING; } - last_job_update = time(NULL); + last_job_update = time(NULL); error_code = delete_step_record(job_ptr, step_id); if (error_code == ENOENT) { info("job_step_complete step %u.%u not found", job_id, step_id); return ESLURM_ALREADY_DONE; } - - debug2("have %d steps, kill %d, state %d %d", - list_count(job_ptr->step_list), - job_ptr->kill_on_step_done, - job_ptr->job_state, IS_JOB_FINISHED(job_ptr)); - if ((job_ptr->kill_on_step_done) - && (list_is_empty(job_ptr->step_list)) - && (!IS_JOB_FINISHED(job_ptr))) - return job_complete(job_id, uid, requeue, job_return_code); - return SLURM_SUCCESS; } @@ -998,7 +986,7 @@ extern int job_step_checkpoint_comp(checkpoint_comp_msg_t *ckpt_ptr, rc = ESLURM_JOB_PENDING; goto reply; } else if ((job_ptr->job_state != JOB_RUNNING) - && (job_ptr->job_state != JOB_SUSPENDED)) { + && (job_ptr->job_state != JOB_SUSPENDED)) { rc = ESLURM_ALREADY_DONE; goto reply; } @@ -1034,7 +1022,7 @@ extern int step_partial_comp(step_complete_msg_t *req, int *rem, { struct job_record *job_ptr; struct step_record *step_ptr; - int nodes; + int nodes, rem_nodes; /* find the job, step, and validate input */ job_ptr = find_job_record (req->job_id); @@ -1045,16 +1033,22 @@ extern int step_partial_comp(step_complete_msg_t *req, int *rem, step_ptr = find_step_record(job_ptr, req->job_step_id); if (step_ptr == NULL) return ESLURM_INVALID_JOB_ID; - if (req->range_last < req->range_first) + if (req->range_last < req->range_first) { + error("step_partial_comp: range: %u-%u", req->range_first, + req->range_last); return EINVAL; + } jobacct_g_aggregate(step_ptr->jobacct, req->jobacct); if (step_ptr->exit_code == NO_VAL) { /* initialize the node bitmap for exited nodes */ nodes = bit_set_count(step_ptr->step_node_bitmap); - if (req->range_last >= nodes) /* range is zero origin */ + if (req->range_last >= nodes) { /* range is zero 
origin */ + error("step_partial_comp: last=%u, nodes=%d", + req->range_last, nodes); return EINVAL; + } xassert(step_ptr->exit_node_bitmap == NULL); step_ptr->exit_node_bitmap = bit_alloc(nodes); if (step_ptr->exit_node_bitmap == NULL) @@ -1063,18 +1057,144 @@ extern int step_partial_comp(step_complete_msg_t *req, int *rem, } else { xassert(step_ptr->exit_node_bitmap); nodes = _bitstr_bits(step_ptr->exit_node_bitmap); - if (req->range_last >= nodes) /* range is zero origin */ + if (req->range_last >= nodes) { /* range is zero origin */ + error("step_partial_comp: last=%u, nodes=%d", + req->range_last, nodes); return EINVAL; + } step_ptr->exit_code = MAX(step_ptr->exit_code, req->step_rc); } bit_nset(step_ptr->exit_node_bitmap, req->range_first, req->range_last); + rem_nodes = bit_clear_count(step_ptr->exit_node_bitmap); if (rem) - *rem = bit_clear_count(step_ptr->exit_node_bitmap); + *rem = rem_nodes; + if (rem_nodes == 0) { + /* release all switch windows */ + if (step_ptr->switch_job) { + debug2("full switch release for step %u.%u, " + "nodes %s", req->job_id, + req->job_step_id, + step_ptr->step_node_list); + switch_g_job_step_complete( + step_ptr->switch_job, + step_ptr->step_node_list); + switch_free_jobinfo (step_ptr->switch_job); + step_ptr->switch_job = NULL; + } + } else if (switch_g_part_comp() && step_ptr->switch_job) { + /* release switch windows on completed nodes, + * must translate range numbers to nodelist */ + hostlist_t hl; + char *node_list; + int new_size = 8096; + + hl = _step_range_to_hostlist(step_ptr, + req->range_first, req->range_last); + node_list = (char *) xmalloc(new_size); + while (hostlist_ranged_string(hl, new_size, + node_list) == -1) { + new_size *= 2; + xrealloc(node_list, new_size ); + } + debug2("partial switch release for step %u.%u, " + "nodes %s", req->job_id, + req->job_step_id, node_list); + switch_g_job_step_part_comp( + step_ptr->switch_job, node_list); + hostlist_destroy(hl); + xfree(node_list); + } + if (max_rc) *max_rc = step_ptr->exit_code; return SLURM_SUCCESS; } +/* convert a range of nodes allocated to a step to a hostlist with + * names of those nodes */ +static hostlist_t _step_range_to_hostlist(struct step_record *step_ptr, + uint32_t range_first, uint32_t range_last) +{ + int i, node_inx = -1; + hostlist_t hl = hostlist_create(""); + + for (i = 0; i < node_record_count; i++) { + if (bit_test(step_ptr->step_node_bitmap, i) == 0) + continue; + node_inx++; + if ((node_inx >= range_first) + && (node_inx <= range_last)) { + hostlist_push(hl, + node_record_table_ptr[i].name); + } + } + + return hl; } + +/* convert a single node name to its offset within a step's + * nodes allocation.
returns -1 on error */ +static int _step_hostname_to_inx(struct step_record *step_ptr, + char *node_name) +{ + struct node_record *node_ptr; + int i, node_inx, node_offset = 0; + + node_ptr = find_node_record(node_name); + if (node_ptr == NULL) + return -1; + node_inx = node_ptr - node_record_table_ptr; + + for (i = 0; i < node_inx; i++) { + if (bit_test(step_ptr->step_node_bitmap, i)) + node_offset++; + } + return node_offset; +} + +extern int step_epilog_complete(struct job_record *job_ptr, + char *node_name) +{ + int rc = 0, node_inx, step_offset; + ListIterator step_iterator; + struct step_record *step_ptr; + struct node_record *node_ptr; + + if (!switch_g_part_comp()) { + /* don't bother with partial completions */ + return 0; + } + if ((node_ptr = find_node_record(node_name)) == NULL) + return 0; + node_inx = node_ptr - node_record_table_ptr; + + step_iterator = list_iterator_create(job_ptr->step_list); + while ((step_ptr = (struct step_record *) list_next (step_iterator))) { + if ((!step_ptr->switch_job) + || (bit_test(step_ptr->step_node_bitmap, node_inx) == 0)) + continue; + if (step_ptr->exit_node_bitmap) { + step_offset = _step_hostname_to_inx( + step_ptr, node_name); + if ((step_offset < 0) + || bit_test(step_ptr->exit_node_bitmap, + step_offset)) + continue; + bit_set(step_ptr->exit_node_bitmap, + step_offset); + } + rc++; + debug2("partial switch release for step %u.%u, " + "epilog on %s", job_ptr->job_id, + step_ptr->step_id, node_name); + switch_g_job_step_part_comp( + step_ptr->switch_job, node_name); + } + list_iterator_destroy (step_iterator); + + return rc; +} + diff --git a/src/slurmd/slurmd/req.c b/src/slurmd/slurmd/req.c index 340058c141c..6b21305edc7 100644 --- a/src/slurmd/slurmd/req.c +++ b/src/slurmd/slurmd/req.c @@ -665,8 +665,8 @@ _rpc_launch_tasks(slurm_msg_t *msg) } slurm_get_ip_str(cli, &port, host, sizeof(host)); - info("launch task %u.%u request from %u.%u@%s", req->job_id, - req->job_step_id, req->uid, req->gid, host); + info("launch task %u.%u request from %u.%u@%s (port %hu)", req->job_id, + req->job_step_id, req->uid, req->gid, host, port); if (_check_job_credential(req->cred, jobid, stepid, req_uid, req->tasks_to_launch[req->srun_node_id], diff --git a/src/slurmd/slurmstepd/mgr.c b/src/slurmd/slurmstepd/mgr.c index d8f31496d38..48a8002bea6 100644 --- a/src/slurmd/slurmstepd/mgr.c +++ b/src/slurmd/slurmstepd/mgr.c @@ -1381,7 +1381,6 @@ _complete_batch_script(slurmd_job_t *job, int err, int status) complete_batch_script_msg_t req; req.job_id = job->jobid; - req.step_id = job->stepid; req.job_rc = status; req.slurm_rc = err; @@ -1392,8 +1391,7 @@ _complete_batch_script(slurmd_job_t *job, int err, int status) req_msg.ret_list = NULL; req_msg.forward_struct_init = 0; - info("sending REQUEST_COMPLETE_BATCH_SCRIPT %u.%u %d", - job->jobid, job->stepid, status); + info("sending REQUEST_COMPLETE_BATCH_SCRIPT"); /* Note: these log messages don't go to slurmd.log from here */ for (i=0; i<=MAX_RETRY; i++) { diff --git a/src/slurmd/slurmstepd/req.c b/src/slurmd/slurmstepd/req.c index a1c8073e5dc..b1527e4893e 100644 --- a/src/slurmd/slurmstepd/req.c +++ b/src/slurmd/slurmstepd/req.c @@ -995,6 +995,7 @@ _handle_completion(int fd, slurmd_job_t *job, uid_t uid) int last; jobacctinfo_t *jobacct = NULL; int step_rc; +/* char bits_string[128]; */ debug("_handle_completion for job %u.%u", job->jobid, job->stepid); @@ -1021,9 +1022,16 @@ _handle_completion(int fd, slurmd_job_t *job, uid_t uid) * Record the completed nodes */ pthread_mutex_lock(&step_complete.lock); +/*
debug2("Setting range %d(bit %d) through %d(bit %d)", */ +/* first, first-(step_complete.rank+1), */ +/* last, last-(step_complete.rank+1)); */ +/* bit_fmt(bits_string, 128, step_complete.bits); */ +/* debug2(" before bits: %s", bits_string); */ bit_nset(step_complete.bits, first - (step_complete.rank+1), last - (step_complete.rank+1)); +/* bit_fmt(bits_string, 128, step_complete.bits); */ +/* debug2(" after bits: %s", bits_string); */ step_complete.step_rc = MAX(step_complete.step_rc, step_rc); /************* acct stuff ********************/ diff --git a/src/smap/job_functions.c b/src/smap/job_functions.c index 151a85365bb..7ac434bbfc4 100644 --- a/src/smap/job_functions.c +++ b/src/smap/job_functions.c @@ -83,7 +83,6 @@ extern void get_job() if ((job.job_state != JOB_PENDING) && (job.job_state != JOB_RUNNING) - && (job.job_state != JOB_DEALLOCATING) && (job.job_state != JOB_SUSPENDED) && ((job.job_state & JOB_COMPLETING) == 0)) continue; /* job has completed */ diff --git a/src/squeue/print.c b/src/squeue/print.c index b79668ab189..a37445ddff2 100644 --- a/src/squeue/print.c +++ b/src/squeue/print.c @@ -1110,7 +1110,6 @@ static int _filter_job(job_info_t * job) } else { if ((job->job_state != JOB_PENDING) && (job->job_state != JOB_RUNNING) - && (job->job_state != JOB_DEALLOCATING) && (job->job_state != JOB_SUSPENDED) && ((job->job_state & JOB_COMPLETING) == 0)) return 4; diff --git a/src/srun/opt.c b/src/srun/opt.c index 64b7d794d9b..08432d596cd 100644 --- a/src/srun/opt.c +++ b/src/srun/opt.c @@ -1554,7 +1554,7 @@ static void _load_multi(int *argc, char **argv) argv[0]); exit(1); } - data_buf = xmalloc(stat_buf.st_size); + data_buf = xmalloc(stat_buf.st_size + 1); while ((i = read(config_fd, &data_buf[data_read], stat_buf.st_size - data_read)) != 0) { if (i < 0) { diff --git a/testsuite/expect/globals b/testsuite/expect/globals index de40a675cae..8288d5d9bd7 100755 --- a/testsuite/expect/globals +++ b/testsuite/expect/globals @@ -90,6 +90,7 @@ cset bin_bash [exec which bash] cset bin_cat "cat" cset bin_cc "gcc" cset bin_chmod "chmod" +cset bin_cmp "cmp" cset bin_cp "cp" cset bin_diff "diff" cset bin_echo "echo" diff --git a/testsuite/expect/test14.4 b/testsuite/expect/test14.4 index b5206cd01af..07f176b4c20 100755 --- a/testsuite/expect/test14.4 +++ b/testsuite/expect/test14.4 @@ -52,20 +52,23 @@ if {[test_front_end] != 0} { # # Delete left-over stdout/err files # Build input script file that broacasts a file +# NOTE: we broadcast the file "sbcast", just for convenienc # set pid [pid] set file1 "/tmp/test.$pid.1.$test_id" set file2 "/tmp/test.$pid.2.$test_id" exec $bin_rm -f $file_out $file_err make_bash_script $file_in " - $srun $bin_touch $file1 + $srun rm -f $file1 + $srun $bin_echo dummy >$file1 $sbcast $sbcast $file1 - $srun $bin_diff $sbcast $file1 + $srun $bin_cmp $sbcast $file1 $srun $bin_rm -f $file1 - $srun $bin_touch $file2 + $srun rm -f $file2 + $srun $bin_echo dummy >$file2 $sbcast $sbcast --force $file2 - $srun $bin_diff $sbcast $file2 + $srun $bin_cmp $sbcast $file2 $srun $bin_rm -f $file2 " diff --git a/testsuite/slurm_unit/common/bitstring-test.c b/testsuite/slurm_unit/common/bitstring-test.c index 0e3338f2fd2..8fe079e50a1 100644 --- a/testsuite/slurm_unit/common/bitstring-test.c +++ b/testsuite/slurm_unit/common/bitstring-test.c @@ -8,7 +8,7 @@ /* Test for failure: */ #define TEST(_tst, _msg) do { \ - if (! _tst) \ + if (! (_tst)) \ fail( _msg ); \ else \ pass( _msg ); \ -- GitLab