From 6d684b773174945ae4bfeeb307d424d2d5bb0ee6 Mon Sep 17 00:00:00 2001 From: Moe Jette <jette1@llnl.gov> Date: Tue, 5 Jun 2007 22:11:13 +0000 Subject: [PATCH] Add new node state FAILING plus trigger for when node enters that state. --- NEWS | 1 + doc/man/man1/scontrol.1 | 11 ++++-- doc/man/man1/sinfo.1 | 63 ++++++++++++++++++----------- doc/man/man1/smap.1 | 34 +++++++++++----- doc/man/man1/strigger.1 | 6 ++- doc/man/man3/slurm_reconfigure.3 | 10 ++--- doc/man/man5/slurm.conf.5 | 46 +++++++++++++-------- slurm/slurm.h.in | 23 ++++++----- src/common/slurm_protocol_defs.c | 28 ++++++++++++- src/common/slurm_protocol_pack.c | 4 +- src/scontrol/update_node.c | 13 ++++-- src/slurmctld/node_mgr.c | 39 ++++++++++++------ src/slurmctld/read_config.c | 13 +++--- src/slurmctld/trigger_mgr.c | 68 +++++++++++++++++++++++++++++--- src/slurmctld/trigger_mgr.h | 1 + src/strigger/opts.c | 12 +++++- src/strigger/strigger.c | 11 ++++++ src/strigger/strigger.h | 1 + 18 files changed, 286 insertions(+), 98 deletions(-) diff --git a/NEWS b/NEWS index 601c1903e14..1d3841c103b 100644 --- a/NEWS +++ b/NEWS @@ -3,6 +3,7 @@ documents those changes that are of interest to users and admins. * Changes in SLURM 1.3.0-pre1 ============================= + -- Add new node state FAILING plus trigger for when node enters that state. * Changes in SLURM 1.2.10 diff --git a/doc/man/man1/scontrol.1 b/doc/man/man1/scontrol.1 index 2da636117b9..fe3803cf539 100644 --- a/doc/man/man1/scontrol.1 +++ b/doc/man/man1/scontrol.1 @@ -1,4 +1,4 @@ -.TH SCONTROL "1" "March 2007" "scontrol 1.2" "Slurm components" +.TH SCONTROL "1" "May 2007" "scontrol 1.3" "Slurm components" .SH "NAME" scontrol \- Used view and modify Slurm configuration and state. @@ -379,18 +379,21 @@ or reconfiguration. Update slurm.conf with any changes meant to be persistent. .TP \fIReason\fP=<reason> -Identify the reason the node is in a "DOWN" or "DRAINED" or "DRAINING" state. 
+Identify the reason the node is in a "DOWN" or "DRAINED", "DRAINING", +"FAILING" or "FAIL" state. Use quotes to enclose a reason having more than one word. .TP \fIState\fP=<state> Identify the state to be assigned to the node. Possible values are "NoResp", -"DRAIN" "RESUME", "DOWN", "IDLE", "ALLOC", and "ALLOCATED". +"ALLOC", "ALLOCATED", "DOWN", "DRAIN", "FAIL", "FAILING", "IDLE" or "RESUME". "RESUME is not an actual node state, but will return a DRAINED, DRAINING, or DOWN node to service, either IDLE or ALLOCATED state as appropriate. Setting a node "DOWN" will cause all running and suspended jobs on that node to be terminated. If you want to remove a node from service, you typically want to set it's state to "DRAIN". +"FAILING" is similar to "DRAIN" except that some applications will +seek to relinquish those nodes before the job completes. The "NoResp" state will only set the "NoResp" flag for a node without changing its underlying state. @@ -517,7 +520,7 @@ scontrol: quit .ec .SH "COPYING" -Copyright (C) 2002 The Regents of the University of California. +Copyright (C) 2002\-2007 The Regents of the University of California. Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). UCRL\-CODE\-226842. .LP diff --git a/doc/man/man1/sinfo.1 b/doc/man/man1/sinfo.1 index c9a0326e9d8..cb18c8b9cbb 100644 --- a/doc/man/man1/sinfo.1 +++ b/doc/man/man1/sinfo.1 @@ -1,4 +1,4 @@ -.TH SINFO "1" "May 2007" "sinfo 1.2" "Slurm components" +.TH SINFO "1" "May 2007" "sinfo 1.3" "Slurm components" .SH "NAME" sinfo \- view information about SLURM nodes and partitions. @@ -166,7 +166,8 @@ Partition name Only user root may initiate jobs, "yes" or "no" .TP \fB%R\fR -The reason a node is unavailable (down, drained, or draining states) +The reason a node is unavailable (down, drained, draining, +fail or failing states) .TP \fB%s\fR Maximum job size in nodes @@ -209,15 +210,17 @@ If set only report state information for responding nodes. 
.TP \fB\-R\fR, \fB\-\-list\-reasons\fR -List reasons nodes are down or drained. When nodes are in -these states SLURM supports optional inclusion of a "reason" -string by an administrator. This option will display the first -35 characters of the reason field and list of nodes with that -reason for all nodes that are, by default, down, drained, or -draining. This option may be used with other node filtering -options (e.g. \fB\-r\fR, \fB\-d\fR, \fB\-t\fR, \fB\-n\fR), -however, combinations of these options that result in a list of -nodes that are not down or drained will not produce any output. +List reasons nodes are in the down, drained, fail or failing state. +When nodes are in these states SLURM supports optional inclusion +of a "reason" string by an administrator. +This option will display the first 35 characters of the reason +field and list of nodes with that reason for all nodes that are, +by default, down, drained, draining or failing. +This option may be used with other node filtering options +(e.g. \fB\-r\fR, \fB\-d\fR, \fB\-t\fR, \fB\-n\fR), +however, combinations of these options that result in a +list of nodes that are not down or drained or failing will +not produce any output. When used with \fB\-l\fR the output additionally includes the current node state. @@ -248,11 +251,12 @@ default sort value is "N" (increasing node name). List nodes only having the given state(s). Multiple states may be comma separated and the comparison is case insensitive. Possible values include (case insensitive): ALLOC, ALLOCATED, -COMP, COMPLETING, DOWN, DRAIN, DRAINED, DRNG, DRAINING, IDLE, -UNK, and UNKNOWN. By default nodes in the specified state are -reported whether they are responding or not. The \fB\-\-dead\fR -and \fB\-\-responding\fR options may be used to filtering nodes by -the responding flag. +COMP, COMPLETING, DOWN, DRAIN, DRAINED, DRNG, DRAINING, FAIL, +FAILING, IDLE, UNK, and UNKNOWN. 
+By default nodes in the specified state are reported whether +they are responding or not. +The \fB\-\-dead\fR and \fB\-\-responding\fR options may be +used to filter nodes by the responding flag. .TP \fB\-\-usage\fR @@ -335,11 +339,13 @@ shared. \fByes\fR indicates resource may be shared or not per job's resource allocation. .TP \fBSTATE\fR -State of the nodes. Possible states include: down, unknown, -idle, allocated, drained, draining, completing and their -abbreviated forms: down, unk, idle, alloc, drain, drng, and -comp respectively. Note that the suffix "*" identifies nodes -that are presently not responding. +State of the nodes. +Possible states include: allocated, completing, down, +drained, draining, fail, failing, idle, and unknown plus +their abbreviated forms: alloc, comp, down, drain, drng, +fail, failg, idle, and unk respectively. +Note that the suffix "*" identifies nodes that are presently +not responding. .TP \fBTMP_DISK\fR Size of temporary disk space in megabytes on these nodes. @@ -351,7 +357,8 @@ If the node state code is followed by "*", this indicates the node is presently not responding and will not be allocated any new work. If the node remains non\-responsive, it will be placed in the \fBDOWN\fR state (except in the case of -\fBDRAINED\fR, \fBDRAINING\fR, or \fBCOMPLETING\fR nodes). +\fBCOMPLETING\fR, \fBDRAINED\fR, \fBDRAINING\fR, +\fBFAIL\fR, or \fBFAILING\fR nodes). If the node state code is followed by "~", this indicates the node is presently in a power saving mode (typically running at reduced frequency). @@ -394,6 +401,18 @@ this state per system administrator request. See the \fBupdate node\fR command in the \fBscontrol\fR(1) man page or the \fBslurm.conf\fR(5) man page for more information. .TP +\fBFAIL\fR +The node is expected to fail soon and is unavailable for +use per system administrator request. +See the \fBupdate node\fR command in the \fBscontrol\fR(1) +man page or the \fBslurm.conf\fR(5) man page for more information. 
+.TP +\fBFAILING\fR +The node is currently executing a job, but is expected to fail +soon and is unavailable for use per system administrator request. +See the \fBupdate node\fR command in the \fBscontrol\fR(1) +man page or the \fBslurm.conf\fR(5) man page for more information. +.TP \fBIDLE\fR The node is not allocated to any jobs and is available for use. .TP diff --git a/doc/man/man1/smap.1 b/doc/man/man1/smap.1 index 85bfbd9ae48..edf3c01ce84 100644 --- a/doc/man/man1/smap.1 +++ b/doc/man/man1/smap.1 @@ -1,4 +1,4 @@ -.TH SMAP "1" "March 2006" "smap 1.1" "Slurm components" +.TH SMAP "1" "May 2007" "smap 1.3" "Slurm components" .SH "NAME" smap \- graphically view information about SLURM jobs, partitions, and set @@ -26,8 +26,8 @@ views and displaying a corresponding node chart. While in any display a user can switch by typing a different view letter. This is true in all modes except for 'configure mode' user can type 'quit' to exit just configure mode. Typing 'exit' will end the configuration mode and exit smap. -Note that unallocated nodes are indicated by a '.' and DOWN or DRAINED -nodes by a '#'. +Note that unallocated nodes are indicated by a '.' and nodes in the +DOWN, DRAINED or FAIL state by a '#'. .RS .TP 15 .I "j" @@ -126,11 +126,13 @@ F (failed), TO (timeout), and NF (node failure). See \fBJOB STATE CODES\fR section below for more information. .TP \fBSTATE\fR -State of the nodes. Possible states include: down, unknown, -idle, allocated, drained, draining, completing and their -abbreviated forms: down, unk, idle, alloc, drain, drng, and -comp respectively. Note that the suffix "*" identifies nodes -that are presently not responding. +State of the nodes. +Possible states include: allocated, completing, down, +drained, draining, fail, failing, idle, and unknown plus +their abbreviated forms: alloc, comp, down, drain, drng, +fail, failg, idle, and unk respectively. +Note that the suffix "*" identifies nodes that are presently +not responding. 
See \fBNODE STATE CODES\fR section below for more information. .TP \fBTIMELIMIT\fR @@ -326,7 +328,9 @@ If the node state code is followed by "*", this indicates the node is presently not responding and will not be allocated any new work. If the node remains non\-responsive, it will be placed in the \fBDOWN\fR state (except in the case of -\fBDRAINED\fR, \fBDRAINING\fR, or \fBCOMPLETING\fR nodes). +\fBCOMPLETING\fR, \fBDRAINED\fR, \fBDRAINING\fR, +\fBFAIL\fR, \fBFAILING\fR nodes). + If the node state code is followed by "~", this indicates the node is presently in a power saving mode (typically running at reduced frequency). @@ -369,6 +373,18 @@ this state per system administrator request. See the \fBupdate node\fR command in the \fBscontrol\fR(1) man page or the \fBslurm.conf\fR(5) man page for more information. .TP +\fBFAIL\fR +The node is expected to fail soon and is unavailable for +use per system administrator request. +See the \fBupdate node\fR command in the \fBscontrol\fR(1) +man page or the \fBslurm.conf\fR(5) man page for more information. +.TP +\fBFAILING\fR +The node is currently executing a job, but is expected to fail +soon and is unavailable for use per system administrator request. +See the \fBupdate node\fR command in the \fBscontrol\fR(1) +man page or the \fBslurm.conf\fR(5) man page for more information. +.TP \fBIDLE\fR The node is not allocated to any jobs and is available for use. .TP diff --git a/doc/man/man1/strigger.1 b/doc/man/man1/strigger.1 index 8494fbe7944..274afc84652 100644 --- a/doc/man/man1/strigger.1 +++ b/doc/man/man1/strigger.1 @@ -1,4 +1,4 @@ -.TH SCONTROL "1" "April 2007" "strigger 1.2" "Slurm components" +.TH SCONTROL "1" "May 2007" "strigger 1.2" "Slurm components" .SH "NAME" strigger \- Used set, get or clear Slurm trigger information. @@ -45,6 +45,10 @@ be cleared. \fB\-d\fR, \fB\-\-down\fR Trigger an event if the specified node goes into a DOWN state. 
+.TP +\fB\-F\fR, \fB\-\-fail\fR +Trigger an event if the specified node goes into a FAILING state. + .TP \fB\-f\fR, \fB\-\-fini\fR Trigger an event when the specified job completes execution. diff --git a/doc/man/man3/slurm_reconfigure.3 b/doc/man/man3/slurm_reconfigure.3 index 21856df4b18..b82406cd9f8 100644 --- a/doc/man/man3/slurm_reconfigure.3 +++ b/doc/man/man3/slurm_reconfigure.3 @@ -1,4 +1,4 @@ -.TH "Slurm API" "3" "October 2005" "Morris Jette" "Slurm administrative calls" +.TH "Slurm API" "3" "May 2007" "Morris Jette" "Slurm administrative calls" .SH "NAME" slurm_delete_partition, slurm_init_part_desc_msg, slurm_reconfigure, slurm_shutdown, slurm_update_job, @@ -92,8 +92,8 @@ prior to setting values of the parameters to be changed. Note: values to zero. This function may only be successfully executed by user root. Note the job priority of zero represents a job that will not be scheduled. Slurm uses the priority one to represent jobs that can not be scheduled until -additional nodes are returned to service (i.e. not DOWN or DRAINED). This -permits lower priority jobs to utilize those resources which are available. +additional nodes are returned to service (i.e. not DOWN, DRAINED, or FAIL). +This permits lower priority jobs to utilize those resources which are available. .LP \fBslurm_update_node\fR Request that the state of one or more nodes be updated. Note that the state of a node (e.g. DRAINING, IDLE, etc.) may be changed, but @@ -101,7 +101,7 @@ its hardware configuration may not be changed by this function. If the hardware configuration of a node changes, update the Slurm configuration file and execute the \fBslurm_reconfigure\fR function. This function may only be successfully executed by user root. If used by some autonomous program, the state value -most likely to be used is \fBNODE_STATE_DRAIN\fR. +most likely to be used is \fBNODE_STATE_DRAIN\fR or \fBNODE_STATE_FAIL\fR. 
The node state flag \fBNODE_STATE_NO_RESPOND\fR may be specified without changing the underlying node state. Note that the node's \fBNODE_STATE_NO_RESPOND\fR flag will be cleared as soon as the slurmd @@ -242,7 +242,7 @@ which must be linked to your process for use (e.g. "cc \-lslurm myprog.c"). .SH "COPYING" -Copyright (C) 2002 The Regents of the University of California. +Copyright (C) 2002\-2007 The Regents of the University of California. Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). UCRL\-CODE\-226842. .LP diff --git a/doc/man/man5/slurm.conf.5 b/doc/man/man5/slurm.conf.5 index 56651ff78a8..bd087948872 100644 --- a/doc/man/man5/slurm.conf.5 +++ b/doc/man/man5/slurm.conf.5 @@ -1,4 +1,4 @@ -.TH "slurm.conf" "5" "December 2006" "slurm.conf 1.2" "Slurm configuration file" +.TH "slurm.conf" "5" "May 2007" "slurm.conf 1.3" "Slurm configuration file" .SH "NAME" slurm.conf \- Slurm configuration file .SH "DESCRIPTION" @@ -777,8 +777,8 @@ node specifications should be place in this file in consecutive order. No single node name may be listed more than once in the configuration file. Use "DownNodes=" to record the state of nodes which are temporarily -in a DOWN or DRAINED state without altering permanent configuration -information. +in a DOWN, DRAINED or FAILING state without altering permanent +configuration information. A job step's tasks are allocated to nodes in order the nodes appear in the configuration file. There is presently no capability within SLURM to arbitarily order a job step's tasks. @@ -880,20 +880,27 @@ Number of logical threads in a single physical core (e.g. "2"). The default value is 1. .TP \fBReason\fR -Identifies the reason for a node being in state "DOWN" or "DRAINED" -or "DRAINING". Use quotes to enclose a reason having more than one -word. +Identifies the reason for a node being in state "DOWN", "DRAINED" +"DRAINING", "FAIL" or "FAILING". +Use quotes to enclose a reason having more than one word. 
.TP \fBState\fR State of the node with respect to the initiation of user jobs. -Acceptable values are "BUSY", "DOWN", "DRAINED", "DRAINING", "IDLE", -and "UNKNOWN". "BUSY" indicates the node has been allocated work +Acceptable values are "BUSY", "DOWN", "DRAINED", "DRAINING", +"FAIL", "FAILING", "IDLE", and "UNKNOWN". +"BUSY" indicates the node has been allocated work and should not be used in the configuration file. "DOWN" indicates the node failed and is unavailable to be allocated work. "DRAINED" indicates the node was configured unavailable to be allocated work and is presently not performing any work. "DRAINING" indicates the node is unavailable to be allocated new work, but is completing the processing of a job. +"FAIL" indicates the node is expected to fail soon, has +no jobs allocated to it, and will not be allocated +to any new jobs. +"FAILING" indicates the node is expected to fail soon, has +one or more jobs allocated to it, but will not be allocated +to any new jobs. "IDLE" indicates the node available to be allocated work, but has none at present "UNKNOWN" indicates the node's state is undefined, but will be @@ -928,27 +935,34 @@ disk space, higher processor speed, etc. Weight is an integer value with a default value of 1. .LP The "DownNodes=" configuration permits you to mark certain nodes as in a -DOWN or DRAINED state without altering the permanent configuration -information listed under a "NodeName=" specification. +DOWN, DRAINED, FAIL, or FAILING state without altering the permanent +configuration information listed under a "NodeName=" specification. .TP \fBDownNodes\fR Any node name, or list of node names, from the "NodeName=" specifications. .TP \fBReason\fR -Identifies the reason for a node being in state "DOWN" or "DRAINED" -or "DRAINING". Use quotes to enclose a reason having more than one -word. +Identifies the reason for a node being in state "DOWN", "DRAINED", +"DRAINING", "FAIL" or "FAILING". 
+Use quotes to enclose a reason having more than one word. .TP \fBState\fR State of the node with respect to the initiation of user jobs. -Acceptable values are "BUSY", "DOWN", "DRAINED", "DRAINING", "IDLE", -and "UNKNOWN". "BUSY" indicates the node has been allocated work +Acceptable values are "BUSY", "DOWN", "DRAINED", "DRAINING", "FAIL", +"FAILING", "IDLE", and "UNKNOWN". +"BUSY" indicates the node has been allocated work and should not be used in the configuration file. "DOWN" indicates the node failed and is unavailable to be allocated work. "DRAINED" indicates the node was configured unavailable to be allocated work and is presently not performing any work. "DRAINING" indicates the node is unavailable to be allocated new work, but is completing the processing of a job. +"FAIL" indicates the node is expected to fail soon, has +no jobs allocated to it, and will not be allocated +to any new jobs. +"FAILING" indicates the node is expected to fail soon, has +one or more jobs allocated to it, but will not be allocated +to any new jobs. "IDLE" indicates the node available to be allocated work, but has none at present "UNKNOWN" indicates the node's state is undefined, but will be @@ -1203,7 +1217,7 @@ PartitionName=batch Nodes=dev[9\-17] MinNodes=4 PartitionName=long Nodes=dev[9\-17] MaxTime=120 AllowGroups=admin .SH "COPYING" -Copyright (C) 2002\-2006 The Regents of the University of California. +Copyright (C) 2002\-2007 The Regents of the University of California. Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). UCRL\-CODE\-226842. .LP diff --git a/slurm/slurm.h.in b/slurm/slurm.h.in index a3fa7cc9054..3c8a2be0dd0 100644 --- a/slurm/slurm.h.in +++ b/slurm/slurm.h.in @@ -369,14 +369,16 @@ enum node_states { }; #define NODE_STATE_BASE 0x00ff #define NODE_STATE_FLAGS 0xff00 -#define NODE_RESUME 0x0100 /* Restore a DRAINED, DRAINING, or - * DOWN node to service (e.g. IDLE or - * ALLOCATED). 
Used in +#define NODE_RESUME 0x0100 /* Restore a DRAINED, DRAINING, DOWN + * or FAILING node to service (e.g. + * IDLE or ALLOCATED). Used in * slurm_update_node() request */ -#define NODE_STATE_DRAIN 0x0200 /* node not be be allocated work */ +#define NODE_STATE_DRAIN 0x0200 /* node not to be allocated new work */ #define NODE_STATE_COMPLETING 0x0400 /* node is completing allocated job */ #define NODE_STATE_NO_RESPOND 0x0800 /* node is not responding */ #define NODE_STATE_POWER_SAVE 0x1000 /* node in power save mode */ +#define NODE_STATE_FAIL 0x2000 /* node is failing, do not allocate + * new work */ /* used to define the size of the credential.signature size * used to define the key size of the io_stream_header_t @@ -943,15 +945,16 @@ typedef struct slurm_step_ctx_struct *slurm_step_ctx; #define TRIGGER_RES_TYPE_NODE 2 #define TRIGGER_TYPE_UP 0x01 #define TRIGGER_TYPE_DOWN 0x02 -#define TRIGGER_TYPE_TIME 0x04 -#define TRIGGER_TYPE_FINI 0x08 -#define TRIGGER_TYPE_RECONFIG 0x10 -#define TRIGGER_TYPE_BLOCK_ERR 0x20 -#define TRIGGER_TYPE_IDLE 0x40 +#define TRIGGER_TYPE_FAIL 0x04 +#define TRIGGER_TYPE_TIME 0x08 +#define TRIGGER_TYPE_FINI 0x10 +#define TRIGGER_TYPE_RECONFIG 0x20 +#define TRIGGER_TYPE_BLOCK_ERR 0x40 +#define TRIGGER_TYPE_IDLE 0x80 typedef struct trigger_info { uint32_t trig_id; /* trigger ID */ - uint8_t res_type; /* TRIGGER_RES_TYPE_* */ + uint16_t res_type; /* TRIGGER_RES_TYPE_* */ char * res_id; /* resource ID */ uint16_t trig_type; /* TRIGGER_TYPE_* */ uint16_t offset; /* seconds from trigger, 0x8000 origin */ diff --git a/src/common/slurm_protocol_defs.c b/src/common/slurm_protocol_defs.c index 707eab8dc3e..e3fb0da01a9 100644 --- a/src/common/slurm_protocol_defs.c +++ b/src/common/slurm_protocol_defs.c @@ -633,8 +633,9 @@ char *job_state_string_compact(enum job_states inx) char *node_state_string(enum node_states inx) { - bool drain_flag = (inx & NODE_STATE_DRAIN); bool comp_flag = (inx & NODE_STATE_COMPLETING); + bool drain_flag = (inx & 
NODE_STATE_DRAIN); + bool fail_flag = (inx & NODE_STATE_FAIL); bool no_resp_flag = (inx & NODE_STATE_NO_RESPOND); bool power_flag = (inx & NODE_STATE_POWER_SAVE); @@ -651,6 +652,17 @@ char *node_state_string(enum node_states inx) return "DRAINED"; } } + if (fail_flag) { + if (comp_flag || (inx == NODE_STATE_ALLOCATED)) { + if (no_resp_flag) + return "FAILING*"; + return "FAILING"; + } else { + if (no_resp_flag) + return "FAIL*"; + return "FAIL"; + } + } if (inx == NODE_STATE_DOWN) { if (no_resp_flag) return "DOWN*"; @@ -685,8 +697,9 @@ char *node_state_string(enum node_states inx) char *node_state_string_compact(enum node_states inx) { - bool drain_flag = (inx & NODE_STATE_DRAIN); bool comp_flag = (inx & NODE_STATE_COMPLETING); + bool drain_flag = (inx & NODE_STATE_DRAIN); + bool fail_flag = (inx & NODE_STATE_FAIL); bool no_resp_flag = (inx & NODE_STATE_NO_RESPOND); bool power_flag = (inx & NODE_STATE_POWER_SAVE); @@ -703,6 +716,17 @@ char *node_state_string_compact(enum node_states inx) return "DRAIN"; } } + if (fail_flag) { + if (comp_flag || (inx == NODE_STATE_ALLOCATED)) { + if (no_resp_flag) + return "FAILG*"; + return "FAILG"; + } else { + if (no_resp_flag) + return "FAIL*"; + return "FAIL"; + } + } if (inx == NODE_STATE_DOWN) { if (no_resp_flag) return "DOWN*"; diff --git a/src/common/slurm_protocol_pack.c b/src/common/slurm_protocol_pack.c index c2aa4bc3e5d..ec56a1ef42f 100644 --- a/src/common/slurm_protocol_pack.c +++ b/src/common/slurm_protocol_pack.c @@ -3915,7 +3915,7 @@ static void _pack_trigger_msg(trigger_info_msg_t *msg , Buf buffer) pack32(msg->record_count, buffer); for (i=0; i<msg->record_count; i++) { pack32 (msg->trigger_array[i].trig_id, buffer); - pack8 (msg->trigger_array[i].res_type, buffer); + pack16 (msg->trigger_array[i].res_type, buffer); packstr(msg->trigger_array[i].res_id, buffer); pack16 (msg->trigger_array[i].trig_type, buffer); pack16 (msg->trigger_array[i].offset, buffer); @@ -3935,7 +3935,7 @@ static int 
_unpack_trigger_msg(trigger_info_msg_t ** msg_ptr , Buf buffer) msg->record_count); for (i=0; i<msg->record_count; i++) { safe_unpack32(&msg->trigger_array[i].trig_id, buffer); - safe_unpack8 (&msg->trigger_array[i].res_type, buffer); + safe_unpack16(&msg->trigger_array[i].res_type, buffer); safe_unpackstr_xmalloc(&msg->trigger_array[i].res_id, &uint16_tmp, buffer); safe_unpack16(&msg->trigger_array[i].trig_type, buffer); diff --git a/src/scontrol/update_node.c b/src/scontrol/update_node.c index fcd34779070..ebe15b925e4 100644 --- a/src/scontrol/update_node.c +++ b/src/scontrol/update_node.c @@ -1,7 +1,7 @@ /*****************************************************************************\ * update_node.c - node update function for scontrol. ***************************************************************************** - * Copyright (C) 2002-2006 The Regents of the University of California. + * Copyright (C) 2002-2007 The Regents of the University of California. * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). * Written by Morris Jette <jette1@llnl.gov> * UCRL-CODE-226842. 
@@ -103,6 +103,10 @@ scontrol_update_node (int argc, char *argv[]) node_msg.node_state = NODE_STATE_DRAIN; update_cnt++; } + else if (strncasecmp(argv[i], "State=FAIL", 10) == 0) { + node_msg.node_state = NODE_STATE_FAIL; + update_cnt++; + } else if (strncasecmp(argv[i], "State=RES", 9) == 0) { node_msg.node_state = NODE_RESUME; update_cnt++; @@ -121,7 +125,7 @@ scontrol_update_node (int argc, char *argv[]) argv[i]); fprintf (stderr, "Request aborted\n"); fprintf (stderr, "Valid states are: "); - fprintf (stderr, "NoResp DRAIN RESUME "); + fprintf (stderr, "NoResp DRAIN FAIL RESUME "); for (k = 0; k < NODE_STATE_END; k++) { fprintf (stderr, "%s ", node_state_string(k)); @@ -141,8 +145,9 @@ scontrol_update_node (int argc, char *argv[]) } } - if ((node_msg.node_state == NODE_STATE_DRAIN) && - (node_msg.reason == NULL)) { + if (((node_msg.node_state == NODE_STATE_DRAIN) + || (node_msg.node_state == NODE_STATE_FAIL)) + && (node_msg.reason == NULL)) { fprintf (stderr, "You must specify a reason when DRAINING a " "node\nRequest aborted\n"); goto done; diff --git a/src/slurmctld/node_mgr.c b/src/slurmctld/node_mgr.c index a570b181ed7..c523aa538b9 100644 --- a/src/slurmctld/node_mgr.c +++ b/src/slurmctld/node_mgr.c @@ -514,6 +514,9 @@ extern int load_all_node_state ( bool state_only ) if (node_state & NODE_STATE_DRAIN) node_ptr->node_state = NODE_STATE_DRAIN; + else if (node_state & NODE_STATE_FAIL) + node_ptr->node_state = + NODE_STATE_FAIL; else if (base_state == NODE_STATE_DOWN) node_ptr->node_state = NODE_STATE_DOWN; } @@ -1031,6 +1034,7 @@ int update_node ( update_node_msg_t * update_node_msg ) if (state_val != (uint16_t) NO_VAL) { if (state_val == NODE_RESUME) { node_ptr->node_state &= (~NODE_STATE_DRAIN); + node_ptr->node_state &= (~NODE_STATE_FAIL); base_state &= NODE_STATE_BASE; if (base_state == NODE_STATE_DOWN) state_val = NODE_STATE_IDLE; @@ -1045,8 +1049,10 @@ int update_node ( update_node_msg_t * update_node_msg ) false); } else if (state_val == 
NODE_STATE_IDLE) { - /* assume they want to clear DRAIN flag too */ + /* assume they want to clear DRAIN and + * FAIL flags too */ node_ptr->node_state &= (~NODE_STATE_DRAIN); + node_ptr->node_state &= (~NODE_STATE_FAIL); bit_set (avail_node_bitmap, node_inx); bit_set (idle_node_bitmap, node_inx); bit_set (up_node_bitmap, node_inx); @@ -1054,7 +1060,8 @@ int update_node ( update_node_msg_t * update_node_msg ) reset_job_priority(); } else if (state_val == NODE_STATE_ALLOCATED) { - if (!(node_ptr->node_state & NODE_STATE_DRAIN)) + if (!(node_ptr->node_state & (NODE_STATE_DRAIN + | NODE_STATE_FAIL))) bit_set (up_node_bitmap, node_inx); bit_set (avail_node_bitmap, node_inx); bit_clear (idle_node_bitmap, node_inx); @@ -1064,6 +1071,12 @@ int update_node ( update_node_msg_t * update_node_msg ) state_val = node_ptr->node_state | NODE_STATE_DRAIN; } + else if (state_val == NODE_STATE_FAIL) { + bit_clear (avail_node_bitmap, node_inx); + state_val = node_ptr->node_state | + NODE_STATE_FAIL; + trigger_node_failing(node_ptr); + } else { info ("Invalid node state specified %d", state_val); @@ -1095,7 +1108,8 @@ int update_node ( update_node_msg_t * update_node_msg ) base_state = node_ptr->node_state & NODE_STATE_BASE; if ((base_state != NODE_STATE_DOWN) - && ((node_ptr->node_state & NODE_STATE_DRAIN) == 0)) + && ((node_ptr->node_state & (NODE_STATE_DRAIN | + NODE_STATE_FAIL)) == 0)) xfree(node_ptr->reason); free (this_node_name); @@ -1319,6 +1333,7 @@ static bool _valid_node_state_change(uint16_t old, uint16_t new) switch (new) { case NODE_STATE_DOWN: case NODE_STATE_DRAIN: + case NODE_STATE_FAIL: return true; break; @@ -1326,7 +1341,8 @@ static bool _valid_node_state_change(uint16_t old, uint16_t new) if (base_state == NODE_STATE_UNKNOWN) return false; if ((base_state == NODE_STATE_DOWN) - || (node_flags & NODE_STATE_DRAIN)) + || (node_flags & NODE_STATE_DRAIN) + || (node_flags & NODE_STATE_FAIL)) return true; break; @@ -1477,7 +1493,7 @@ validate_node_specs (char *node_name, 
uint16_t cpus, set_node_down(node_name, reason_down); _sync_bitmaps(node_ptr, job_count); } else if (status == ESLURMD_PROLOG_FAILED) { - if ((node_flags & NODE_STATE_DRAIN) == 0) { + if ((node_flags & (NODE_STATE_DRAIN | NODE_STATE_FAIL)) == 0) { last_node_update = time (NULL); error ("Prolog failure on node %s, state to DOWN", node_name); @@ -1656,7 +1672,8 @@ extern int validate_nodes_via_front_end(uint32_t job_count, } if (status == ESLURMD_PROLOG_FAILED) { - if (!(node_ptr->node_state & NODE_STATE_DRAIN)) { + if (!(node_ptr->node_state & (NODE_STATE_DRAIN | + NODE_STATE_FAIL))) { updated_job = true; if (prolog_hostlist) (void) hostlist_push_host( @@ -1769,7 +1786,7 @@ static void _sync_bitmaps(struct node_record *node_ptr, int job_count) } base_state = node_ptr->node_state & NODE_STATE_BASE; if ((base_state == NODE_STATE_DOWN) - || (node_ptr->node_state & NODE_STATE_DRAIN)) + || (node_ptr->node_state & (NODE_STATE_DRAIN | NODE_STATE_FAIL))) bit_clear (avail_node_bitmap, node_inx); else bit_set (avail_node_bitmap, node_inx); @@ -1846,7 +1863,7 @@ static void _node_did_resp(struct node_record *node_ptr) bit_set (share_node_bitmap, node_inx); } if ((base_state == NODE_STATE_DOWN) - || (node_flags & NODE_STATE_DRAIN)) + || (node_flags & (NODE_STATE_DRAIN | NODE_STATE_FAIL))) bit_clear (avail_node_bitmap, node_inx); else bit_set (avail_node_bitmap, node_inx); @@ -1910,8 +1927,8 @@ static void _node_not_resp (struct node_record *node_ptr, time_t msg_time) } /* - * set_node_down - make the specified node's state DOWN if possible - * (not in a DRAIN state), kill jobs as needed + * set_node_down - make the specified node's state DOWN and + * kill jobs as needed * IN name - name of the node * IN reason - why the node is DOWN */ @@ -2229,7 +2246,7 @@ void make_node_idle(struct node_record *node_ptr, if (base_state == NODE_STATE_DOWN) { debug3("make_node_idle: Node %s being left DOWN", node_ptr->name); - } else if ((node_ptr->node_state & NODE_STATE_DRAIN) && + } else if 
((node_flags & (NODE_STATE_DRAIN | NODE_STATE_FAIL)) && (node_ptr->run_job_cnt == 0) && (node_ptr->comp_job_cnt == 0)) { node_ptr->node_state = NODE_STATE_IDLE | node_flags; diff --git a/src/slurmctld/read_config.c b/src/slurmctld/read_config.c index ada8a93c978..f9d007c6996 100644 --- a/src/slurmctld/read_config.c +++ b/src/slurmctld/read_config.c @@ -185,7 +185,7 @@ static int _build_bitmaps(void) base_state = node_record_table_ptr[i].node_state & NODE_STATE_BASE; drain_flag = node_record_table_ptr[i].node_state & - NODE_STATE_DRAIN; + (NODE_STATE_DRAIN | NODE_STATE_FAIL); no_resp_flag = node_record_table_ptr[i].node_state & NODE_STATE_NO_RESPOND; job_cnt = node_record_table_ptr[i].run_job_cnt + @@ -299,9 +299,12 @@ static int _state_str2int(const char *state_str) break; } } - if ((i >= NODE_STATE_END) - && (strncasecmp("DRAIN", state_str, 5) == 0)) - state_val = NODE_STATE_IDLE | NODE_STATE_DRAIN; + if (i >= NODE_STATE_END) { + if (strncasecmp("DRAIN", state_str, 5) == 0) + state_val = NODE_STATE_IDLE | NODE_STATE_DRAIN; + else if (strncasecmp("FAIL", state_str, 4) == 0) + state_val = NODE_STATE_IDLE | NODE_STATE_FAIL; + } if (state_val == NO_VAL) { error("invalid node state %s", state_str); errno = EINVAL; @@ -673,7 +676,7 @@ static int _build_all_partitionline_info() * state information depending upon value * 0 = use no saved state information * 1 = recover saved job and trigger state, - * node DOWN/DRAIN state and reason information + * node DOWN/DRAIN/FAIL state and reason information * 2 = recover all state saved from last slurmctld shutdown * RET 0 if no error, otherwise an error code * Note: Operates on common variables only diff --git a/src/slurmctld/trigger_mgr.c b/src/slurmctld/trigger_mgr.c index 8fa90cccc78..7b30f9c97c5 100644 --- a/src/slurmctld/trigger_mgr.c +++ b/src/slurmctld/trigger_mgr.c @@ -64,7 +64,7 @@ #define MAX_PROG_TIME 300 /* maximum run time for program */ /* Change TRIGGER_STATE_VERSION value when changing the state save format */ 
-#define TRIGGER_STATE_VERSION "VER001" +#define TRIGGER_STATE_VERSION "VER002" /* TRIG_IS_JOB_FINI differs from IS_JOB_FINISHED by considering * completing jobs as not really finished */ @@ -75,13 +75,14 @@ List trigger_list; uint32_t next_trigger_id = 1; static pthread_mutex_t trigger_mutex = PTHREAD_MUTEX_INITIALIZER; bitstr_t *trigger_down_nodes_bitmap = NULL; +bitstr_t *trigger_fail_nodes_bitmap = NULL; bitstr_t *trigger_up_nodes_bitmap = NULL; static bool trigger_block_err = false; static bool trigger_node_reconfig = false; typedef struct trig_mgr_info { uint32_t trig_id; /* trigger ID */ - uint8_t res_type; /* TRIGGER_RES_TYPE_* */ + uint16_t res_type; /* TRIGGER_RES_TYPE_* */ char * res_id; /* node name or job_id (string) */ bitstr_t *nodes_bitmap; /* bitmap of requested nodes (if applicable) */ uint32_t job_id; /* job ID (if applicable) */ @@ -104,7 +105,7 @@ void _trig_del(void *x) { } #if _DEBUG -static char *_res_type(uint8_t res_type) +static char *_res_type(uint16_t res_type) { if (res_type == TRIGGER_RES_TYPE_JOB) return "job"; @@ -120,6 +121,8 @@ static char *_trig_type(uint16_t trig_type) return "up"; else if (trig_type == TRIGGER_TYPE_DOWN) return "down"; + else if (trig_type == TRIGGER_TYPE_FAIL) + return "fail"; else if (trig_type == TRIGGER_TYPE_IDLE) return "idle"; else if (trig_type == TRIGGER_TYPE_TIME) @@ -365,6 +368,17 @@ extern void trigger_node_down(struct node_record *node_ptr) slurm_mutex_unlock(&trigger_mutex); } +extern void trigger_node_failing(struct node_record *node_ptr) +{ + int inx = node_ptr - node_record_table_ptr; + + slurm_mutex_lock(&trigger_mutex); + if (trigger_fail_nodes_bitmap == NULL) + trigger_fail_nodes_bitmap = bit_alloc(node_record_count); + bit_set(trigger_fail_nodes_bitmap, inx); + slurm_mutex_unlock(&trigger_mutex); +} + extern void trigger_node_up(struct node_record *node_ptr) { @@ -394,7 +408,7 @@ extern void trigger_block_error(void) static void _dump_trigger_state(trig_mgr_info_t *trig_ptr, Buf buffer) { 
pack32 (trig_ptr->trig_id, buffer); - pack8 (trig_ptr->res_type, buffer); + pack16 (trig_ptr->res_type, buffer); packstr (trig_ptr->res_id, buffer); /* rebuild nodes_bitmap as needed from res_id */ /* rebuild job_id as needed from res_id */ @@ -414,7 +428,7 @@ static int _load_trigger_state(Buf buffer) trig_ptr = xmalloc(sizeof(trig_mgr_info_t)); safe_unpack32 (&trig_ptr->trig_id, buffer); - safe_unpack8 (&trig_ptr->res_type, buffer); + safe_unpack16 (&trig_ptr->res_type, buffer); safe_unpackstr_xmalloc(&trig_ptr->res_id, &str_len, buffer); /* rebuild nodes_bitmap as needed from res_id */ /* rebuild job_id as needed from res_id */ @@ -678,6 +692,21 @@ static void _trigger_job_event(trig_mgr_info_t *trig_in, time_t now) } } + if (trig_in->trig_type & TRIGGER_TYPE_FAIL) { + if (trigger_fail_nodes_bitmap + && bit_overlap(trig_in->job_ptr->node_bitmap, + trigger_fail_nodes_bitmap)) { +#if _DEBUG + info("trigger[%u] for job %u node fail", + trig_in->trig_id, trig_in->job_id); +#endif + trig_in->state = 1; + trig_in->trig_time = now + + (trig_in->trig_time - 0x8000); + return; + } + } + if (trig_in->trig_type & TRIGGER_TYPE_UP) { if (trigger_up_nodes_bitmap && bit_overlap(trig_in->job_ptr->node_bitmap, @@ -734,6 +763,34 @@ static void _trigger_node_event(trig_mgr_info_t *trig_in, time_t now) } } + if ((trig_in->trig_type & TRIGGER_TYPE_FAIL) + && trigger_fail_nodes_bitmap + && (bit_ffs(trigger_fail_nodes_bitmap) != -1)) { + if (trig_in->nodes_bitmap == NULL) { /* all nodes */ + xfree(trig_in->res_id); + trig_in->res_id = bitmap2node_name( + trigger_fail_nodes_bitmap); + trig_in->state = 1; + } else if (bit_overlap(trig_in->nodes_bitmap, + trigger_fail_nodes_bitmap)) { + bit_and(trig_in->nodes_bitmap, + trigger_fail_nodes_bitmap); + xfree(trig_in->res_id); + trig_in->res_id = bitmap2node_name( + trig_in->nodes_bitmap); + trig_in->state = 1; + } + if (trig_in->state == 1) { + trig_in->trig_time = now + + (trig_in->trig_time - 0x8000); +#if _DEBUG + info("trigger[%u] for 
node %s fail", + trig_in->trig_id, trig_in->res_id); +#endif + return; + } + } + if (trig_in->trig_type & TRIGGER_TYPE_IDLE) { /* We need to determine which (if any) of these * nodes have been idle for at least the offset time */ @@ -946,5 +1003,6 @@ extern void trigger_fini(void) trigger_list = NULL; } FREE_NULL_BITMAP(trigger_down_nodes_bitmap); + FREE_NULL_BITMAP(trigger_fail_nodes_bitmap); FREE_NULL_BITMAP(trigger_up_nodes_bitmap); } diff --git a/src/slurmctld/trigger_mgr.h b/src/slurmctld/trigger_mgr.h index 80d922bdead..685b6358b84 100644 --- a/src/slurmctld/trigger_mgr.h +++ b/src/slurmctld/trigger_mgr.h @@ -51,6 +51,7 @@ extern int trigger_set(uid_t uid, gid_t gid, trigger_info_msg_t *msg); /* Note the some event has occured and flag triggers as needed */ extern void trigger_block_error(void); extern void trigger_node_down(struct node_record *node_ptr); +extern void trigger_node_failing(struct node_record *node_ptr); extern void trigger_node_up(struct node_record *node_ptr); extern void trigger_reconfig(void); diff --git a/src/strigger/opts.c b/src/strigger/opts.c index 987d3cfc988..645755027ad 100644 --- a/src/strigger/opts.c +++ b/src/strigger/opts.c @@ -92,6 +92,7 @@ extern void parse_command_line(int argc, char *argv[]) static struct option long_options[] = { {"block_err", no_argument, 0, OPT_LONG_BLOCK_ERR}, {"down", no_argument, 0, 'd'}, + {"fail", no_argument, 0, 'F'}, {"fini", no_argument, 0, 'f'}, {"id", required_argument, 0, 'i'}, {"idle", no_argument, 0, 'I'}, @@ -117,7 +118,7 @@ extern void parse_command_line(int argc, char *argv[]) _init_options(); optind = 0; - while((opt_char = getopt_long(argc, argv, "dfi:Ij:no:p:qrtuvV", + while((opt_char = getopt_long(argc, argv, "dFfi:Ij:no:p:qrtuvV", long_options, &option_index)) != -1) { switch (opt_char) { case (int)'?': @@ -131,6 +132,9 @@ extern void parse_command_line(int argc, char *argv[]) case (int)'d': params.node_down = true; break; + case (int)'F': + params.node_fail = true; + break; case 
(int)'f': params.job_fini = true; break; @@ -227,6 +231,7 @@ static void _init_options( void ) params.block_err = false; params.node_down = false; + params.node_fail = false; params.node_idle = false; params.trigger_id = 0; params.job_fini = false; @@ -253,6 +258,7 @@ static void _print_options( void ) verbose("job_id = %u", params.job_id); verbose("job_fini = %s", params.job_fini ? "true" : "false"); verbose("node_down = %s", params.node_down ? "true" : "false"); + verbose("node_fail = %s", params.node_fail ? "true" : "false"); verbose("node_idle = %s", params.node_idle ? "true" : "false"); verbose("node_up = %s", params.node_up ? "true" : "false"); verbose("node = %s", params.node_id); @@ -282,7 +288,8 @@ static void _validate_options( void ) } if (params.mode_set - && ((params.node_down + params.node_idle + params.node_up + params.reconfig + + && ((params.node_down + params.node_fail + params.node_idle + params.node_up + + params.reconfig + params.job_fini + params.time_limit + params.block_err) == 0)) { error("You must specify a trigger (--block_err, --down, --up, " "--reconfig, --time or --fini)"); @@ -342,6 +349,7 @@ Usage: strigger [--set | --get | --clear] [OPTIONS]\n\ --clear delete a trigger\n\n\ --block_err trigger event on BlueGene block error\n\ -d, --down trigger event when node goes DOWN\n\ + -F, --fail trigger event when node is expected to FAIL\n\ -f, --fini trigger event when job finishes\n\ -i, --id=# a trigger's ID number\n\ -I, --idle trigger event when node remains IDLE\n\ diff --git a/src/strigger/strigger.c b/src/strigger/strigger.c index c8789e689ce..032468e6ec3 100644 --- a/src/strigger/strigger.c +++ b/src/strigger/strigger.c @@ -147,6 +147,8 @@ static int _set_trigger(void) ti.trig_type |= TRIGGER_TYPE_BLOCK_ERR; if (params.node_down) ti.trig_type |= TRIGGER_TYPE_DOWN; + if (params.node_fail) + ti.trig_type |= TRIGGER_TYPE_FAIL; if (params.node_idle) ti.trig_type |= TRIGGER_TYPE_IDLE; if (params.node_up) @@ -208,6 +210,13 @@ static int 
_get_trigger(void) != TRIGGER_TYPE_DOWN)) continue; } + if (params.node_fail) { + if ((trig_msg->trigger_array[i].res_type + != TRIGGER_RES_TYPE_NODE) + || (trig_msg->trigger_array[i].trig_type + != TRIGGER_TYPE_FAIL)) + continue; + } if (params.node_id) { if (trig_msg->trigger_array[i].res_type != TRIGGER_RES_TYPE_NODE) @@ -281,6 +290,8 @@ static char *_trig_type(uint16_t trig_type) return "up"; else if (trig_type == TRIGGER_TYPE_DOWN) return "down"; + else if (trig_type == TRIGGER_TYPE_FAIL) + return "fail"; else if (trig_type == TRIGGER_TYPE_IDLE) return "idle"; else if (trig_type == TRIGGER_TYPE_TIME) diff --git a/src/strigger/strigger.h b/src/strigger/strigger.h index 8aa26706a26..b2c0b6f9481 100644 --- a/src/strigger/strigger.h +++ b/src/strigger/strigger.h @@ -55,6 +55,7 @@ struct strigger_parameters { bool mode_clear; bool node_down; char * node_id; + bool node_fail; bool node_idle; bool node_up; int offset; -- GitLab