diff --git a/NEWS b/NEWS
index fa4797d77b916eb0e7ca41464e5e89339633138f..0d9e2d2f00a382fbc34421679843c833ad1bdef4 100644
--- a/NEWS
+++ b/NEWS
@@ -7,7 +7,7 @@ documents those changes that are of interest to users and admins.
  -- Notify srun to retry step creation upon completion of other job steps
     rather than polling. This results in much faster throughput for job step
     execution with --exclusive option.
- -- Added ResvEpilog and ResvProlog configuration parameters to execute a
+ -- Added "ResvEpilog" and "ResvProlog" configuration parameters to execute a
     program at the beginning and end of each reservation.
  -- Added "slurm_load_job_user" function. This is a variation of
     "slurm_load_jobs", but accepts a user ID argument, potentially resulting
@@ -15,6 +15,8 @@ documents those changes that are of interest to users and admins.
  -- Added "slurm_load_node_single" function. This is a variation of
     "slurm_load_nodes", but accepts a node name argument, potentially resulting
     in substantial performance improvement for "sinfo --nodes=NAME".
+ -- Added "HealthCheckNodeState" configuration parameter to identify the node
+    states on which HealthCheckProgram should be executed.
 
 * Changes in SLURM 2.5.1
 ========================
@@ -37,6 +39,11 @@ documents those changes that are of interest to users and admins.
  -- BLUEGENE - Correct method to update conn_type of a job.
  -- BLUEGENE - Fix issue with preemption when needing to preempt multiple jobs
     to make one job run.
+ -- Fixed issue where if an srun dies abnormally inside of an allocation it
+    would also have killed the allocation.
+ -- FRONTEND - fixed issue where if a system's nodes weren't defined in the
+    slurm.conf with NodeAddrs, signals going to a step could be handled
+    incorrectly.
 
 * Changes in SLURM 2.5.0
 ========================
diff --git a/RELEASE_NOTES b/RELEASE_NOTES
index b0f9572e5508665de9cb1ffa28511b2e37b636f4..30b1522e94ebf833d6c63f5324b04082b6e3031f 100644
--- a/RELEASE_NOTES
+++ b/RELEASE_NOTES
@@ -28,8 +28,10 @@ HIGHLIGHTS
 
 CONFIGURATION FILE CHANGES (see "man slurm.conf" for details)
 =============================================================
- - Added ResvEpilog and ResvProlog configuration parameters to execute a
+ - Added "ResvEpilog" and "ResvProlog" configuration parameters to execute a
   program at the beginning and end of a reservation.
+ - Added "HealthCheckNodeState" configuration parameter to identify the node
+   states on which HealthCheckProgram should be executed.
 
 COMMAND CHANGES (see man pages for details)
 ===========================================
diff --git a/doc/man/man5/slurm.conf.5 b/doc/man/man5/slurm.conf.5
index 77bc0bb76f9f9032ea5e58942ec8649c057f15d1..876fa2b9ec8a7a4214a78c959f56325e3b6b614e 100644
--- a/doc/man/man5/slurm.conf.5
+++ b/doc/man/man5/slurm.conf.5
@@ -511,7 +511,7 @@ upon termination of a job allocation (e.g.
 The program executes as SlurmUser, which gives it permission to drain
 nodes and requeue the job if a failure occurs or cancel the job if appropriate.
 Exactly what the program does and how it accomplishes this is completely at
-the discression of the system administrator.
+the discretion of the system administrator.
 Information about the job being initiated, it's allocated nodes, etc. are
 passed to the program using environment variables.
 See \fBProlog and Epilog Scripts\fR for more information.
@@ -603,6 +603,26 @@ Also see the \fBGroupUpdateForce\fR parameter.
 The interval in seconds between executions of \fBHealthCheckProgram\fR.
 The default value is zero, which disables execution.
+.TP
+\fBHealthCheckNodeState\fR
+Identify the node states on which \fBHealthCheckProgram\fR should be executed.
+Multiple state values may be specified with a comma separator.
+The default value is ANY to execute on nodes in any state.
+.RS
+.TP 12
+\fBALLOC\fR
+Run on nodes in the ALLOC state (all CPUs allocated).
+.TP
+\fBANY\fR
+Run on nodes in any state.
+.TP
+\fBIDLE\fR
+Run on nodes in the IDLE state.
+.TP
+\fBMIXED\fR
+Run on nodes in the MIXED state (some CPUs idle and other CPUs allocated).
+.RE
+
 .TP
 \fBHealthCheckProgram\fR
 Fully qualified pathname of a script to execute as user root periodically
@@ -1258,7 +1278,7 @@ nodes and requeue the job if a failure occurs or cancel the job if
 appropriate.
 The program can be used to reboot nodes or perform other work to prepare
 resources for use.
 Exactly what the program does and how it accomplishes this is completely at
-the discression of the system administrator.
+the discretion of the system administrator.
 Information about the job being initiated, it's allocated nodes, etc. are
 passed to the program using environment variables.
 While this program is running, the nodes associated with the job will be
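For reference, a slurm.conf excerpt using the new parameter together with the
existing health-check options might look like the following; the program path
and interval shown here are illustrative values, not part of this patch:

    HealthCheckProgram=/usr/sbin/node_health_check
    HealthCheckInterval=300
    HealthCheckNodeState=IDLE,MIXED

With such settings slurmctld would request the health check only on nodes that
are idle or partially allocated, leaving fully allocated (ALLOC) nodes
undisturbed.
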
diff --git a/slurm/slurm.h.in b/slurm/slurm.h.in
index 2e768c4c276db61b539fdcf2c113f20b572081f7..4087d317e43a6cbf2d012c79a0d07eb2db58b932 100644
--- a/slurm/slurm.h.in
+++ b/slurm/slurm.h.in
@@ -1845,6 +1845,12 @@ typedef struct reservation_name_msg {
 #define RECONFIG_KEEP_PART_INFO 0x0001 /* keep dynamic partition info on scontrol reconfig */
 #define RECONFIG_KEEP_PART_STAT 0x0002 /* keep dynamic partition state on scontrol reconfig */
+#define HEALTH_CHECK_NODE_IDLE  0x0001 /* execute on idle nodes */
+#define HEALTH_CHECK_NODE_ALLOC 0x0002 /* execute on fully allocated nodes */
+#define HEALTH_CHECK_NODE_MIXED 0x0004 /* execute on partially allocated nodes */
+#define HEALTH_CHECK_NODE_ANY   0xffff /* execute on all node states */
+
+
 typedef struct slurm_ctl_conf {
         time_t last_update;     /* last update time of the build parameters */
         uint16_t accounting_storage_enforce; /* job requires valid association:
@@ -1891,6 +1897,9 @@ typedef struct slurm_ctl_conf {
         uint16_t group_info;    /* see GROUP_* fields above */
         uint32_t hash_val;      /* Hash value of the slurm.conf file */
         uint16_t health_check_interval; /* secs between health checks */
+        uint16_t health_check_node_state; /* Node states on which to execute
+                                           * health check program, see
+                                           * HEALTH_CHECK_NODE_* above */
         char * health_check_program;    /* pathname of health check program */
         uint16_t inactive_limit;/* seconds of inactivity before a
                                  * inactive resource allocation is released */
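The new symbols are plain bit flags, so a HealthCheckNodeState setting can
combine several states in one uint16_t, and HEALTH_CHECK_NODE_ANY (0xffff)
matches every state bit, including any added later. A minimal standalone
sketch of how a combined setting is tested; the flag values are copied from
the slurm.h.in hunk above, and the program itself is illustrative only:

    #include <stdint.h>
    #include <stdio.h>

    /* Values copied from the slurm.h.in hunk above. */
    #define HEALTH_CHECK_NODE_IDLE  0x0001  /* execute on idle nodes */
    #define HEALTH_CHECK_NODE_ALLOC 0x0002  /* execute on fully allocated nodes */
    #define HEALTH_CHECK_NODE_MIXED 0x0004  /* execute on partially allocated nodes */
    #define HEALTH_CHECK_NODE_ANY   0xffff  /* execute on all node states */

    int main(void)
    {
            /* e.g. HealthCheckNodeState=IDLE,MIXED */
            uint16_t node_states = HEALTH_CHECK_NODE_IDLE | HEALTH_CHECK_NODE_MIXED;

            printf("check IDLE  nodes: %s\n",
                   (node_states & HEALTH_CHECK_NODE_IDLE)  ? "yes" : "no");
            printf("check MIXED nodes: %s\n",
                   (node_states & HEALTH_CHECK_NODE_MIXED) ? "yes" : "no");
            printf("check ALLOC nodes: %s\n",
                   (node_states & HEALTH_CHECK_NODE_ALLOC) ? "yes" : "no");
            return 0;
    }
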
diff --git a/src/api/config_info.c b/src/api/config_info.c
index 0ad074e5e2314afa7dea00e06a0bb7d990423b4c..6def95806409416d2347168773ba95c8e5c671bb 100644
--- a/src/api/config_info.c
+++ b/src/api/config_info.c
@@ -393,6 +393,12 @@ extern void *slurm_ctl_conf_2_key_pairs (slurm_ctl_conf_t* slurm_ctl_conf_ptr)
         key_pair->value = xstrdup(tmp_str);
         list_append(ret_list, key_pair);
 
+        key_pair = xmalloc(sizeof(config_key_pair_t));
+        key_pair->name = xstrdup("HealthCheckNodeState");
+        key_pair->value = health_check_node_state_str(slurm_ctl_conf_ptr->
+                                                      health_check_node_state);
+        list_append(ret_list, key_pair);
+
         key_pair = xmalloc(sizeof(config_key_pair_t));
         key_pair->name = xstrdup("HealthCheckProgram");
         key_pair->value = xstrdup(slurm_ctl_conf_ptr->health_check_program);
diff --git a/src/common/node_select.c b/src/common/node_select.c
index bb90d7d5c79a55721b4928377a729d8ea5fa40a0..6869b6dede37e101aa4e3d42ac2c7435ee7e598f 100644
--- a/src/common/node_select.c
+++ b/src/common/node_select.c
@@ -870,7 +870,7 @@ extern int select_g_select_nodeinfo_get(dynamic_plugin_data_t *nodeinfo,
         if (slurm_select_init(0) < 0)
                 return SLURM_ERROR;
 
-        if(nodeinfo) {
+        if (nodeinfo) {
                 nodedata = nodeinfo->data;
                 plugin_id = nodeinfo->plugin_id;
         } else
diff --git a/src/common/read_config.c b/src/common/read_config.c
index 1af4026acd1cfa263346fb88e85157881ec3fbf2..75ae3c2eb7e94fb43c3b115175108b60eadd9b29 100644
--- a/src/common/read_config.c
+++ b/src/common/read_config.c
@@ -193,6 +193,7 @@ s_p_options_t slurm_conf_options[] = {
         {"GroupUpdateForce", S_P_UINT16},
         {"GroupUpdateTime", S_P_UINT16},
         {"HealthCheckInterval", S_P_UINT16},
+        {"HealthCheckNodeState", S_P_STRING},
         {"HealthCheckProgram", S_P_STRING},
         {"InactiveLimit", S_P_UINT16},
         {"JobAcctGatherType", S_P_STRING},
@@ -2478,6 +2479,34 @@ static void _normalize_debug_level(uint16_t *level)
         /* level is uint16, always > LOG_LEVEL_QUIET(0), can't underflow */
 }
 
+/* Convert HealthCheckNodeState string to numeric value */
+static uint16_t _health_node_state(char *state_str)
+{
+        uint16_t state_num = 0;
+        char *tmp_str = xstrdup(state_str);
+        char *token, *last = NULL;
+
+        token = strtok_r(tmp_str, ",", &last);
+        while (token) {
+                if (!strcasecmp(token, "ANY"))
+                        state_num |= HEALTH_CHECK_NODE_ANY;
+                else if (!strcasecmp(token, "ALLOC"))
+                        state_num |= HEALTH_CHECK_NODE_ALLOC;
+                else if (!strcasecmp(token, "IDLE"))
+                        state_num |= HEALTH_CHECK_NODE_IDLE;
+                else if (!strcasecmp(token, "MIXED"))
+                        state_num |= HEALTH_CHECK_NODE_MIXED;
+                else {
+                        error("Invalid HealthCheckNodeState value %s ignored",
+                              token);
+                }
+                token = strtok_r(NULL, ",", &last);
+        }
+        xfree(tmp_str);
+
+        return state_num;
+}
+
 /*
  *
  * IN/OUT ctl_conf_ptr - a configuration as loaded by read_slurm_conf_ctl
@@ -2762,6 +2791,12 @@ _validate_and_set_defaults(slurm_ctl_conf_t *conf, s_p_hashtbl_t *hashtbl)
         s_p_get_uint16(&conf->health_check_interval, "HealthCheckInterval",
                        hashtbl);
 
+        if (s_p_get_string(&temp_str, "HealthCheckNodeState", hashtbl)) {
+                conf->health_check_node_state = _health_node_state(temp_str);
+                xfree(temp_str);
+        } else
+                conf->health_check_node_state = HEALTH_CHECK_NODE_ANY;
+
         s_p_get_string(&conf->health_check_program, "HealthCheckProgram",
                        hashtbl);
diff --git a/src/common/slurm_protocol_defs.c b/src/common/slurm_protocol_defs.c
index 68500c4eca7fa9de228214a8357daa25b4f66adb..d09e4f88975b68a9cf48308ddeb0c199a6cde4cf 100644
--- a/src/common/slurm_protocol_defs.c
+++ b/src/common/slurm_protocol_defs.c
@@ -1315,6 +1315,34 @@ extern char *trigger_res_type(uint16_t res_type)
         return "unknown";
 }
 
+/* Convert HealthCheckNodeState numeric value to a string.
+ * Caller must xfree() the return value */
+extern char *health_check_node_state_str(uint16_t node_state)
+{
+        char *state_str = NULL;
+
+        if (node_state == HEALTH_CHECK_NODE_ANY) {
+                state_str = xstrdup("ANY");
+                return state_str;
+        }
+
+        state_str = xstrdup("");
+        if (node_state & HEALTH_CHECK_NODE_IDLE)
+                xstrcat(state_str, "IDLE");
+        if (node_state & HEALTH_CHECK_NODE_ALLOC) {
+                if (state_str[0])
+                        xstrcat(state_str, ",");
+                xstrcat(state_str, "ALLOC");
+        }
+        if (node_state & HEALTH_CHECK_NODE_MIXED) {
+                if (state_str[0])
+                        xstrcat(state_str, ",");
+                xstrcat(state_str, "MIXED");
+        }
+
+        return state_str;
+}
+
 extern char *trigger_type(uint32_t trig_type)
 {
         if (trig_type == TRIGGER_TYPE_UP)
diff --git a/src/common/slurm_protocol_defs.h b/src/common/slurm_protocol_defs.h
index a6b59a5063750e28354e1dac53b6690aba7a1714..8f4a491310f66e51c2ba7768ee0030bb4828b86a 100644
--- a/src/common/slurm_protocol_defs.h
+++ b/src/common/slurm_protocol_defs.h
@@ -1176,6 +1176,10 @@ extern uint16_t preempt_mode_num(const char *preempt_mode);
 extern char *log_num2string(uint16_t inx);
 extern uint16_t log_string2num(char *name);
 
+/* Convert HealthCheckNodeState numeric value to a string.
+ * Caller must xfree() the return value */
+extern char *health_check_node_state_str(uint16_t node_state);
+
 extern char *sched_param_type_string(uint16_t select_type_param);
 extern char *job_reason_string(enum job_state_reason inx);
 extern char *job_state_string(uint16_t inx);
diff --git a/src/common/slurm_protocol_pack.c b/src/common/slurm_protocol_pack.c
index e4234a1640b367afbf8ffbb559d37132efd2d98a..2dcf7e60f832fbba741edd04eb0e6e130eb953dc 100644
--- a/src/common/slurm_protocol_pack.c
+++ b/src/common/slurm_protocol_pack.c
@@ -4578,6 +4578,7 @@ _pack_slurm_ctl_conf_msg(slurm_ctl_conf_info_msg_t * build_ptr, Buf buffer,
         pack32(build_ptr->hash_val, buffer);
         pack16(build_ptr->health_check_interval, buffer);
+        pack16(build_ptr->health_check_node_state, buffer);
         packstr(build_ptr->health_check_program, buffer);
         pack16(build_ptr->inactive_limit, buffer);
@@ -5228,6 +5229,7 @@ _unpack_slurm_ctl_conf_msg(slurm_ctl_conf_info_msg_t **build_buffer_ptr,
         safe_unpack32(&build_ptr->hash_val, buffer);
         safe_unpack16(&build_ptr->health_check_interval, buffer);
+        safe_unpack16(&build_ptr->health_check_node_state, buffer);
         safe_unpackstr_xmalloc(&build_ptr->health_check_program,
                                &uint32_tmp, buffer);
diff --git a/src/slurmctld/ping_nodes.c b/src/slurmctld/ping_nodes.c
index dd361d61196e5ca2f1a91f4f72576361a0a88314..ae8002810b47622754cc53acfbc77271dc309541 100644
--- a/src/slurmctld/ping_nodes.c
+++ b/src/slurmctld/ping_nodes.c
@@ -49,6 +49,7 @@
 #include <string.h>
 
 #include "src/common/hostlist.h"
+#include "src/common/node_select.h"
 #include "src/common/read_config.h"
 #include "src/slurmctld/agent.h"
 #include "src/slurmctld/front_end.h"
@@ -343,7 +344,7 @@ extern void run_health_check(void)
 #else
         struct node_record *node_ptr;
 #endif
-        int i;
+        int i, node_states = slurmctld_conf.health_check_node_state;
         char *host_str = NULL;
         agent_arg_t *check_agent_args = NULL;
 
@@ -363,11 +364,43 @@ extern void run_health_check(void)
                 check_agent_args->node_count++;
         }
 #else
+        if ((node_states != HEALTH_CHECK_NODE_ANY) &&
+            (node_states != HEALTH_CHECK_NODE_IDLE)) {
+                /* Update each node's alloc_cpus count */
+                select_g_select_nodeinfo_set_all();
+        }
+
         for (i=0, node_ptr=node_record_table_ptr; i<node_record_count;
              i++, node_ptr++) {
                 if (IS_NODE_NO_RESPOND(node_ptr) || IS_NODE_FUTURE(node_ptr) ||
                     IS_NODE_POWER_SAVE(node_ptr))
                         continue;
+                if (node_states != HEALTH_CHECK_NODE_ANY) {
+                        uint16_t cpus_total, cpus_used = 0;
+                        if (slurmctld_conf.fast_schedule) {
+                                cpus_total = node_ptr->config_ptr->cpus;
+                        } else {
+                                cpus_total = node_ptr->cpus;
+                        }
+                        if (!IS_NODE_IDLE(node_ptr)) {
+                                select_g_select_nodeinfo_get(
+                                        node_ptr->select_nodeinfo,
+                                        SELECT_NODEDATA_SUBCNT,
+                                        NODE_STATE_ALLOCATED,
+                                        &cpus_used);
+                        }
+                        if (cpus_used == 0) {
+                                if (!(node_states & HEALTH_CHECK_NODE_IDLE))
+                                        continue;
+                        } else if (cpus_used < cpus_total) {
+                                if (!(node_states & HEALTH_CHECK_NODE_MIXED))
+                                        continue;
+                        } else {
+                                if (!(node_states & HEALTH_CHECK_NODE_ALLOC))
+                                        continue;
+                        }
+                }
+
                 hostlist_push(check_agent_args->hostlist, node_ptr->name);
                 check_agent_args->node_count++;
         }
diff --git a/src/slurmctld/proc_req.c b/src/slurmctld/proc_req.c
index 208437c27fad3f4a023b572665eea0e4ebc01c57..6505cd591f2e309329304b0d1caee8677062bc82 100644
--- a/src/slurmctld/proc_req.c
+++ b/src/slurmctld/proc_req.c
@@ -519,6 +519,7 @@ void _fill_ctld_conf(slurm_ctl_conf_t * conf_ptr)
         conf_ptr->hash_val            = conf->hash_val;
         conf_ptr->health_check_interval = conf->health_check_interval;
+        conf_ptr->health_check_node_state = conf->health_check_node_state;
         conf_ptr->health_check_program = xstrdup(conf->health_check_program);
         conf_ptr->job_acct_gather_freq  = conf->job_acct_gather_freq;
diff --git a/src/squeue/opts.c b/src/squeue/opts.c
index 67066974b217a9f5549109851b52e2d7e7e09e41..5258ad1a3ef9312381656a1776717a018ab99c1b 100644
--- a/src/squeue/opts.c
+++ b/src/squeue/opts.c
@@ -394,7 +394,28 @@ parse_command_line( int argc, char* argv[] )
                 }
         }
 
-        params.max_cpus = _max_cpus_per_node();
+        if (params.job_list && (list_count(params.job_list) == 1)) {
+                ListIterator iterator;
+                uint32_t *job_id_ptr;
+                iterator = list_iterator_create(params.job_list);
+                job_id_ptr = list_next(iterator);
+                params.job_id = *job_id_ptr;
+                list_iterator_destroy(iterator);
+        }
+        if (params.user_list && (list_count(params.user_list) == 1)) {
+                ListIterator iterator;
+                uint32_t *uid_ptr;
+                iterator = list_iterator_create(params.user_list);
+                while ((uid_ptr = list_next(iterator))) {
+                        params.user_id = *uid_ptr;
+                        break;
+                }
+                list_iterator_destroy(iterator);
+        }
+        if (params.job_id || params.user_id)
+                params.max_cpus = 1;    /* To minimize overhead */
+        else
+                params.max_cpus = _max_cpus_per_node();
 
         if ( params.verbose )
                 _print_options();
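The squeue changes in opts.c above and squeue.c below replace a full
slurm_load_jobs() query with slurm_load_job() or the new slurm_load_job_user()
call announced in the NEWS entry whenever a single job ID or user ID is
requested. A minimal sketch of calling the new function outside of squeue;
the user ID is a placeholder and error handling is reduced to the essentials:

    #include <stdio.h>
    #include <slurm/slurm.h>
    #include <slurm/slurm_errno.h>

    int main(void)
    {
            job_info_msg_t *jobs = NULL;
            uint32_t uid = 1000;    /* placeholder user ID */

            /* Load only this user's jobs rather than the whole queue */
            if (slurm_load_job_user(&jobs, uid, SHOW_ALL) != SLURM_SUCCESS) {
                    slurm_perror("slurm_load_job_user");
                    return 1;
            }
            printf("%u job(s) found\n", jobs->record_count);
            slurm_free_job_info_msg(jobs);
            return 0;
    }
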
diff --git a/src/squeue/squeue.c b/src/squeue/squeue.c
index 946522995765d57d4faba12116950c9cb43096a9..7f16b5ff518cbdabed05a2d1c3b26ea2bd3df1fa 100644
--- a/src/squeue/squeue.c
+++ b/src/squeue/squeue.c
@@ -172,7 +172,6 @@ _print_job ( bool clear_old )
         static job_info_msg_t * old_job_ptr = NULL, * new_job_ptr;
         int error_code;
         uint16_t show_flags = 0;
-        uint32_t job_id = 0;
 
         if (params.all_flag || (params.job_list && list_count(params.job_list)))
                 show_flags |= SHOW_ALL;
@@ -181,22 +180,17 @@ _print_job ( bool clear_old )
         if (params.format && strstr(params.format, "C"))
                 show_flags |= SHOW_DETAIL;
 
-        if (params.job_list && (list_count(params.job_list) == 1)) {
-                ListIterator iterator;
-                uint32_t *job_id_ptr;
-                iterator = list_iterator_create(params.job_list);
-                job_id_ptr = list_next(iterator);
-                job_id = *job_id_ptr;
-                list_iterator_destroy(iterator);
-        }
-
         if (old_job_ptr) {
                 if (clear_old)
                         old_job_ptr->last_update = 0;
-                if (job_id) {
+                if (params.job_id) {
                         error_code = slurm_load_job(
-                                &new_job_ptr, job_id,
+                                &new_job_ptr, params.job_id,
                                 show_flags);
+                } else if (params.user_id) {
+                        error_code = slurm_load_job_user(&new_job_ptr,
+                                                         params.user_id,
+                                                         show_flags);
                 } else {
                         error_code = slurm_load_jobs(
                                 old_job_ptr->last_update,
@@ -208,19 +202,12 @@ _print_job ( bool clear_old )
                         error_code = SLURM_SUCCESS;
                         new_job_ptr = old_job_ptr;
                 }
-        } else if (job_id) {
-                error_code = slurm_load_job(&new_job_ptr, job_id, show_flags);
-        } else if (params.user_list && (list_count(params.user_list) == 1)) {
-                ListIterator iterator;
-                uint32_t user_id = 0, *uid_ptr;
-                iterator = list_iterator_create(params.user_list);
-                while ((uid_ptr = list_next(iterator))) {
-                        user_id = *uid_ptr;
-                        break;
-                }
-                list_iterator_destroy(iterator);
-                error_code = slurm_load_job_user(&new_job_ptr, user_id,
-                                                 show_flags);
+        } else if (params.job_id) {
+                error_code = slurm_load_job(&new_job_ptr, params.job_id,
+                                            show_flags);
+        } else if (params.user_id) {
+                error_code = slurm_load_job_user(&new_job_ptr, params.user_id,
+                                                 show_flags);
         } else {
                 error_code = slurm_load_jobs((time_t) NULL, &new_job_ptr,
                                              show_flags);
@@ -231,7 +218,7 @@ _print_job ( bool clear_old )
                 return SLURM_ERROR;
         }
         old_job_ptr = new_job_ptr;
-        if (job_id)
+        if (params.job_id || params.user_id)
                 old_job_ptr->last_update = (time_t) 0;
 
         if (params.verbose) {
diff --git a/src/squeue/squeue.h b/src/squeue/squeue.h
index 1df5449f6f86c300c4999a596953db53848146c6..0a6220be2d44af5dc88fec6d4666963ba0b4d057 100644
--- a/src/squeue/squeue.h
+++ b/src/squeue/squeue.h
@@ -102,6 +102,9 @@ struct squeue_parameters {
         char* steps;
         char* users;
 
+        uint32_t job_id;        /* set if request for a single job ID */
+        uint32_t user_id;       /* set if request for a single user ID */
+
         List  account_list;
         List  format_list;
         List  job_list;
diff --git a/src/srun/libsrun/srun_job.c b/src/srun/libsrun/srun_job.c
index 09a8edebb65f46f50b707372a4c3fd2e1b60ed8f..7d6a5ae508b2424702fe8dfb3bac17a4b52f6eed 100644
--- a/src/srun/libsrun/srun_job.c
+++ b/src/srun/libsrun/srun_job.c
@@ -592,7 +592,7 @@ extern void create_srun_job(srun_job_t **p_job, bool *got_alloc,
                  * Spawn process to insure clean-up of job and/or step
                  * on abnormal termination */
-                shepard_fd = _shepard_spawn(job, got_alloc);
+                shepard_fd = _shepard_spawn(job, *got_alloc);
         }
 
         *p_job = job;
@@ -1305,7 +1305,8 @@ static int _shepard_spawn(srun_job_t *job, bool got_alloc)
                 }
         }
 
-        (void) slurm_terminate_job_step(job->jobid, job->stepid);
+        (void) slurm_kill_job_step(job->jobid, job->stepid, SIGKILL);
+
         if (got_alloc)
                 slurm_complete_job(job->jobid, NO_VAL);
         exit(0);
diff --git a/src/sview/job_info.c b/src/sview/job_info.c
index b90554da8421a4e34597853c1b476d8d2017855f..aa8273552af845fdfb4cad9c8ca1451b3e58dc56 100644
--- a/src/sview/job_info.c
+++ b/src/sview/job_info.c
@@ -597,13 +597,8 @@ static int _cancel_step_id(uint32_t job_id, uint32_t step_id,
         for (i = 0; i < MAX_CANCEL_RETRY; i++) {
                 /* NOTE: RPC always sent to slurmctld rather than directly
                  * to slurmd daemons */
-                if (signal == SIGKILL) {
-                        error_code = slurm_terminate_job_step(job_id, step_id);
+                error_code = slurm_kill_job_step(job_id, step_id, signal);
 
-                } else {
-                        error_code = slurm_kill_job_step(job_id, step_id,
-                                                         signal);
-                }
                 if (error_code == 0
                     || (errno != ESLURM_TRANSITION_STATE_NO_UPDATE
                         && errno != ESLURM_JOB_PENDING))
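The srun shepard and sview cancel paths above now both go through
slurm_kill_job_step(); sending SIGKILL through that call is the drop-in
replacement for the removed slurm_terminate_job_step() usage. A minimal sketch
of the same call pattern; the job and step IDs are placeholders supplied by
the caller:

    #include <signal.h>
    #include <slurm/slurm.h>
    #include <slurm/slurm_errno.h>

    /* Kill one job step the same way the patched srun/sview code does. */
    int kill_step(uint32_t job_id, uint32_t step_id)
    {
            if (slurm_kill_job_step(job_id, step_id, SIGKILL) != SLURM_SUCCESS) {
                    slurm_perror("slurm_kill_job_step");
                    return -1;
            }
            return 0;
    }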