diff --git a/NEWS b/NEWS index d56d3a31804bbfe695832354c96e510000aecf7b..0611c3bcc9731ca3caf6429998b190e5ea4e67c5 100644 --- a/NEWS +++ b/NEWS @@ -41,6 +41,8 @@ documents those changes that are of interest to users and admins. upon a patch by Phil Ekcert (LLNL). -- BGQ - Added logic to keep track of cnodes in an error state inside of a booted block. + -- Add the ability to update a node's NodeAddr and NodeHostName with scontrol. + Also enable setting a node's state to "future" using scontrol. * Changes in SLURM 2.3.1 ======================== diff --git a/slurm/slurm.h.in b/slurm/slurm.h.in index 0f90598a55ad719897629da0585c89fd8f46e459..0849fbf698e089afc2634966a8b0659138cff27a 100644 --- a/slurm/slurm.h.in +++ b/slurm/slurm.h.in @@ -1981,10 +1981,14 @@ typedef struct submit_response_msg { uint32_t error_code; /* error code for warning message */ } submit_response_msg_t; +/* NOTE: If setting node_addr and/or node_hostname then comma separate names + * and include an equal number of node_names */ typedef struct slurm_update_node_msg { char *features; /* new feature for node */ char *gres; /* new generic resources for node */ - char *node_names; /* comma separated list of required nodes */ + char *node_addr; /* communication name (optional) */ + char *node_hostname; /* node's hostname (optional) */ + char *node_names; /* nodelist expression */ uint16_t node_state; /* see enum node_states */ char *reason; /* reason for node being DOWN or DRAINING */ uint32_t reason_uid; /* user ID of sending (needed if user diff --git a/src/common/hostlist.h b/src/common/hostlist.h index 80d91592e28322049ce119320a74e322852f6059..be6539f4af0b12a9407b3b8caad63c2602c0acb1 100644 --- a/src/common/hostlist.h +++ b/src/common/hostlist.h @@ -64,6 +64,12 @@ #endif #define HIGHEST_BASE 36 +#define FREE_NULL_HOSTLIST(_X) \ + do { \ + if (_X) hostlist_destroy (_X); \ + _X = NULL; \ + } while (0) + extern char *alpha_num; /* Notes: diff --git a/src/common/slurm_protocol_defs.c b/src/common/slurm_protocol_defs.c index cadcc5e3c3aa354f6fe75e8d88cc209f1bfe183f..84294549ca05d2ffee31e6200532affcc4828184 100644 --- a/src/common/slurm_protocol_defs.c +++ b/src/common/slurm_protocol_defs.c @@ -520,6 +520,8 @@ extern void slurm_free_update_node_msg(update_node_msg_t * msg) if (msg) { xfree(msg->features); xfree(msg->gres); + xfree(msg->node_addr); + xfree(msg->node_hostname); xfree(msg->node_names); xfree(msg->reason); xfree(msg); diff --git a/src/common/slurm_protocol_pack.c b/src/common/slurm_protocol_pack.c index b682bc3a69c57745f4ffa18313611479f3066eed..872e7c82c3b8e70f04d95b4cecb019e9a25fb22f 100644 --- a/src/common/slurm_protocol_pack.c +++ b/src/common/slurm_protocol_pack.c @@ -2190,7 +2190,17 @@ _pack_update_node_msg(update_node_msg_t * msg, Buf buffer, uint16_t protocol_version) { xassert(msg != NULL); - if (protocol_version >= SLURM_2_2_PROTOCOL_VERSION) { + if (protocol_version >= SLURM_2_4_PROTOCOL_VERSION) { + packstr(msg->node_addr, buffer); + packstr(msg->node_hostname, buffer); + packstr(msg->node_names, buffer); + pack16(msg->node_state, buffer); + packstr(msg->features, buffer); + packstr(msg->gres, buffer); + packstr(msg->reason, buffer); + pack32(msg->weight, buffer); + pack32(msg->reason_uid, buffer); + } else if (protocol_version >= SLURM_2_2_PROTOCOL_VERSION) { packstr(msg->node_names, buffer); pack16(msg->node_state, buffer); packstr(msg->features, buffer); @@ -2219,7 +2229,20 @@ _unpack_update_node_msg(update_node_msg_t ** msg, Buf buffer, tmp_ptr = xmalloc(sizeof(update_node_msg_t)); *msg = tmp_ptr; - if (protocol_version >= SLURM_2_2_PROTOCOL_VERSION) { + if (protocol_version >= SLURM_2_4_PROTOCOL_VERSION) { + safe_unpackstr_xmalloc(&tmp_ptr->node_addr, + &uint32_tmp, buffer); + safe_unpackstr_xmalloc(&tmp_ptr->node_hostname, + &uint32_tmp, buffer); + safe_unpackstr_xmalloc(&tmp_ptr->node_names, + &uint32_tmp, buffer); + safe_unpack16(&tmp_ptr->node_state, buffer); + safe_unpackstr_xmalloc(&tmp_ptr->features, &uint32_tmp, buffer); + safe_unpackstr_xmalloc(&tmp_ptr->gres, &uint32_tmp, buffer); + safe_unpackstr_xmalloc(&tmp_ptr->reason, &uint32_tmp, buffer); + safe_unpack32(&tmp_ptr->weight, buffer); + safe_unpack32(&tmp_ptr->reason_uid, buffer); + } else if (protocol_version >= SLURM_2_2_PROTOCOL_VERSION) { safe_unpackstr_xmalloc(&tmp_ptr->node_names, &uint32_tmp, buffer); safe_unpack16(&tmp_ptr->node_state, buffer); diff --git a/src/scontrol/update_node.c b/src/scontrol/update_node.c index 0879b67a72a6a5cef58776795edaac7cc4a86140..cfc16e6bc29b57a82520007eca5c78aca2802d85 100644 --- a/src/scontrol/update_node.c +++ b/src/scontrol/update_node.c @@ -72,9 +72,17 @@ scontrol_update_node (int argc, char *argv[]) error("Invalid input: %s Request aborted", argv[i]); return -1; } - if (strncasecmp(tag, "NodeName", MAX(tag_len, 1)) == 0) + + if (strncasecmp(tag, "NodeAddr", MAX(tag_len, 5)) == 0) { + node_msg.node_addr = val; + update_cnt++; + } else if (strncasecmp(tag, "NodeHostName", MAX(tag_len, 5)) + == 0) { + node_msg.node_hostname = val; + update_cnt++; + } else if (strncasecmp(tag, "NodeName", MAX(tag_len, 1)) == 0) { node_msg.node_names = val; - else if (strncasecmp(tag, "Features", MAX(tag_len, 1)) == 0) { + } else if (strncasecmp(tag, "Features", MAX(tag_len, 1)) == 0) { node_msg.features = val; update_cnt++; } else if (strncasecmp(tag, "Gres", MAX(tag_len, 1)) == 0) { @@ -156,6 +164,10 @@ scontrol_update_node (int argc, char *argv[]) MAX(val_len, 3)) == 0) { node_msg.node_state = NODE_STATE_FAIL; update_cnt++; + } else if (strncasecmp(val, "FUTURE", + MAX(val_len, 3)) == 0) { + node_msg.node_state = NODE_STATE_FUTURE; + update_cnt++; } else if (strncasecmp(val, "RESUME", MAX(val_len, 3)) == 0) { node_msg.node_state = NODE_RESUME; diff --git a/src/slurmctld/node_mgr.c b/src/slurmctld/node_mgr.c index 3c98473ddfe3f08a4b5ea36743b546c6dd2005a1..04f6b595417154f7d2781d39cb95a4a82b2cf146 100644 --- a/src/slurmctld/node_mgr.c +++ b/src/slurmctld/node_mgr.c @@ -817,28 +817,64 @@ void set_slurmd_addr (void) */ int update_node ( update_node_msg_t * update_node_msg ) { - int error_code = 0, node_inx; + int error_code = 0, node_cnt, node_inx; struct node_record *node_ptr = NULL; char *this_node_name = NULL; - hostlist_t host_list; - uint16_t base_state = 0, state_val; + hostlist_t host_list, hostaddr_list = NULL, hostname_list = NULL; + uint16_t base_state = 0, node_flags, state_val; time_t now = time(NULL); - if (update_node_msg -> node_names == NULL ) { - error ("update_node: invalid node name %s", + if (update_node_msg->node_names == NULL ) { + info("update_node: invalid node name %s", update_node_msg -> node_names ); return ESLURM_INVALID_NODE_NAME; } - if ( (host_list = hostlist_create (update_node_msg -> node_names)) - == NULL) { - error ("hostlist_create error on %s: %m", - update_node_msg -> node_names); + host_list = hostlist_create(update_node_msg->node_names); + if (host_list == NULL) { + info("update_node: hostlist_create error on %s: %m", + update_node_msg->node_names); return ESLURM_INVALID_NODE_NAME; } + node_cnt = hostlist_count(host_list); + + if (update_node_msg->node_addr) { + hostaddr_list = hostlist_create(update_node_msg->node_addr); + if (hostaddr_list == NULL) { + info("update_node: hostlist_create error on %s: %m", + update_node_msg->node_addr); + FREE_NULL_HOSTLIST(host_list); + return ESLURM_INVALID_NODE_NAME; + } + if (node_cnt != hostlist_count(hostaddr_list)) { + info("update_node: nodecount mismatch"); + FREE_NULL_HOSTLIST(host_list); + FREE_NULL_HOSTLIST(hostaddr_list); + return ESLURM_INVALID_NODE_NAME; + } + } + + if (update_node_msg->node_hostname) { + hostname_list = hostlist_create(update_node_msg->node_hostname); + if (hostname_list == NULL) { + info("update_node: hostlist_create error on %s: %m", + update_node_msg->node_hostname); + FREE_NULL_HOSTLIST(host_list); + FREE_NULL_HOSTLIST(hostaddr_list); + return ESLURM_INVALID_NODE_NAME; + } + if (node_cnt != hostlist_count(hostname_list)) { + info("update_node: nodecount mismatch"); + FREE_NULL_HOSTLIST(host_list); + FREE_NULL_HOSTLIST(hostaddr_list); + FREE_NULL_HOSTLIST(hostname_list); + return ESLURM_INVALID_NODE_NAME; + } + } while ( (this_node_name = hostlist_shift (host_list)) ) { int err_code = 0; + state_val = update_node_msg->node_state; node_ptr = find_node_record (this_node_name); node_inx = node_ptr - node_record_table_ptr; @@ -850,6 +886,19 @@ int update_node ( update_node_msg_t * update_node_msg ) break; } + if (hostaddr_list) { + char *this_addr = hostlist_shift(hostaddr_list); + xfree(node_ptr->comm_name); + node_ptr->comm_name = xstrdup(this_addr); + free(this_addr); + } + if (hostname_list) { + char *this_hostname = hostlist_shift(hostname_list); + xfree(node_ptr->node_hostname); + node_ptr->node_hostname = xstrdup(this_hostname); + free(this_hostname); + } + if (update_node_msg->features) { xfree(node_ptr->features); if (update_node_msg->features[0]) @@ -887,6 +936,7 @@ int update_node ( update_node_msg_t * update_node_msg ) error_code = ESLURM_INVALID_NODE_STATE; } base_state &= NODE_STATE_BASE; + node_flags = node_ptr->node_state & NODE_STATE_FLAGS; } if (state_val != (uint16_t) NO_VAL) { if (state_val == NODE_RESUME) { @@ -929,11 +979,16 @@ int update_node ( update_node_msg_t * update_node_msg ) } else state_val = base_state; } - if (state_val == NODE_STATE_DOWN) { + if ((state_val == NODE_STATE_DOWN) || + (state_val == NODE_STATE_FUTURE)) { /* We must set node DOWN before killing * its jobs */ _make_node_down(node_ptr, now); kill_running_job_by_node_name (this_node_name); + if (state_val == NODE_STATE_FUTURE) { + node_ptr->node_state = NODE_STATE_FUTURE + | node_flags; + } } else if (state_val == NODE_STATE_IDLE) { /* assume they want to clear DRAIN and * FAIL flags too */ @@ -1033,7 +1088,9 @@ int update_node ( update_node_msg_t * update_node_msg ) free (this_node_name); } - hostlist_destroy (host_list); + FREE_NULL_HOSTLIST(host_list); + FREE_NULL_HOSTLIST(hostaddr_list); + FREE_NULL_HOSTLIST(hostname_list); last_node_update = now; if ((error_code == 0) && (update_node_msg->features)) { @@ -1485,6 +1542,12 @@ static bool _valid_node_state_change(uint16_t old, uint16_t new) return true; break; + case NODE_STATE_FUTURE: + if ((base_state == NODE_STATE_DOWN) || + (base_state == NODE_STATE_IDLE)) + return true; + break; + case NODE_STATE_IDLE: if ((base_state == NODE_STATE_DOWN) || (base_state == NODE_STATE_IDLE))