diff --git a/NEWS b/NEWS index 4eeb3f751ca1ebb92d99902f12bc937ae2e4f5b4..0e9f8e3abc54deeb78afdbcd10b9c124bc76b698 100644 --- a/NEWS +++ b/NEWS @@ -1,6 +1,13 @@ This file describes changes in recent versions of SLURM. It primarily documents those changes that are of interest to users and admins. +* Changes in SLURM 2.1.0-pre1 +============================= + -- Slurmd notifies slurmctld of node boot time to better clean up after node + reboots. + -- Slurmd sends node registration information repeatedly until successful + transmit. + * Changes in SLURM 2.0.0 ============================ -- Fix for bluegene systems to be able to create 32 node blocks with only diff --git a/src/common/slurm_protocol_api.c b/src/common/slurm_protocol_api.c index 8a35ce116044f6fae1e739836f1628845b2cef85..57b6725fc13f78db367ff3917d0918754d01d7f7 100644 --- a/src/common/slurm_protocol_api.c +++ b/src/common/slurm_protocol_api.c @@ -245,6 +245,23 @@ uint16_t slurm_get_resume_timeout(void) return resume_timeout; } +/* slurm_get_suspend_time + * RET SuspendTime value from slurm.conf + */ +uint16_t slurm_get_suspend_time(void) +{ + uint16_t suspend_time = 0; + slurm_ctl_conf_t *conf; + + if(slurmdbd_conf) { + } else { + conf = slurm_conf_lock(); + suspend_time = conf->suspend_time; + slurm_conf_unlock(); + } + return suspend_time; +} + /* slurm_get_def_mem_per_task * RET DefMemPerTask value from slurm.conf */ diff --git a/src/common/slurm_protocol_api.h b/src/common/slurm_protocol_api.h index 5b7f67b96b49ff75485960f42db1910764e68645..e5c51b6bfef82218ae09019eee12e7cf7e80cfe9 100644 --- a/src/common/slurm_protocol_api.h +++ b/src/common/slurm_protocol_api.h @@ -108,6 +108,11 @@ uint16_t slurm_get_batch_start_timeout(void); */ uint16_t slurm_get_resume_timeout(void); +/* slurm_get_suspend_time + * RET SuspendTime value from slurm.conf + */ +uint16_t slurm_get_suspend_time(void); + /* slurm_get_complete_wait * RET CompleteWait value from slurm.conf */ diff --git a/src/common/slurm_protocol_defs.h b/src/common/slurm_protocol_defs.h index ebb731f40ae86c5299da0a647981274808cb896a..e96ed53ecc6f5aab3cd6954a23853646b4f794b8 100644 --- a/src/common/slurm_protocol_defs.h +++ b/src/common/slurm_protocol_defs.h @@ -798,6 +798,7 @@ typedef struct slurm_node_registration_status_msg { uint32_t *step_id; /* IDs of running job steps (if any) */ uint32_t status; /* node status code, same as return codes */ uint16_t startup; /* slurmd just restarted */ + uint32_t up_time; /* seconds since reboot */ switch_node_info_t switch_nodeinfo; /* set only if startup != 0 */ } slurm_node_registration_status_msg_t; diff --git a/src/common/slurm_protocol_pack.c b/src/common/slurm_protocol_pack.c index cb240584c1e58ac0f6d640dda29fe5518587804c..7ca80258d933cf9e60d0c38b8d5dad6dd7e52b44 100644 --- a/src/common/slurm_protocol_pack.c +++ b/src/common/slurm_protocol_pack.c @@ -1679,24 +1679,26 @@ _pack_node_registration_status_msg(slurm_node_registration_status_msg_t * xassert(msg != NULL); pack_time(msg->timestamp, buffer); - pack32((uint32_t)msg->status, buffer); + pack32(msg->status, buffer); packstr(msg->node_name, buffer); packstr(msg->arch, buffer); packstr(msg->os, buffer); - pack16((uint32_t)msg->cpus, buffer); - pack16((uint32_t)msg->sockets, buffer); - pack16((uint32_t)msg->cores, buffer); - pack16((uint32_t)msg->threads, buffer); - pack32((uint32_t)msg->real_memory, buffer); - pack32((uint32_t)msg->tmp_disk, buffer); - pack32((uint32_t)msg->job_count, buffer); + pack16(msg->cpus, buffer); + pack16(msg->sockets, buffer); + pack16(msg->cores, buffer); + pack16(msg->threads, buffer); + pack32(msg->real_memory, buffer); + pack32(msg->tmp_disk, buffer); + pack32(msg->up_time, buffer); + + pack32(msg->job_count, buffer); for (i = 0; i < msg->job_count; i++) { - pack32((uint32_t)msg->job_id[i], buffer); + pack32(msg->job_id[i], buffer); } for (i = 0; i < msg->job_count; i++) { pack32(msg->step_id[i], buffer); } - pack16((uint16_t)msg->startup, buffer); + pack16(msg->startup, buffer); if (msg->startup) switch_g_pack_node_info(msg->switch_nodeinfo, buffer); } @@ -1727,6 +1729,8 @@ _unpack_node_registration_status_msg(slurm_node_registration_status_msg_t safe_unpack16(&node_reg_ptr->threads, buffer); safe_unpack32(&node_reg_ptr->real_memory, buffer); safe_unpack32(&node_reg_ptr->tmp_disk, buffer); + safe_unpack32(&node_reg_ptr->up_time, buffer); + safe_unpack32(&node_reg_ptr->job_count, buffer); node_reg_ptr->job_id = xmalloc(sizeof(uint32_t) * node_reg_ptr->job_count); diff --git a/src/slurmctld/job_mgr.c b/src/slurmctld/job_mgr.c index 3516beb26c3dfa536ba21f8f390be017ea114d2c..362cd15335c0e24542704a2fa403bcfb511a1fe3 100644 --- a/src/slurmctld/job_mgr.c +++ b/src/slurmctld/job_mgr.c @@ -140,14 +140,14 @@ static int _list_find_job_old(void *job_entry, void *key); static int _load_job_details(struct job_record *job_ptr, Buf buffer); static int _load_job_state(Buf buffer); static void _notify_srun_missing_step(struct job_record *job_ptr, int node_inx, - time_t now); + time_t now, time_t node_boot_time); static void _pack_job_for_ckpt (struct job_record *job_ptr, Buf buffer); static void _pack_default_job_details(struct job_details *detail_ptr, Buf buffer); static void _pack_pending_job_details(struct job_details *detail_ptr, Buf buffer); static int _purge_job_record(uint32_t job_id); -static void _purge_lost_batch_jobs(int node_inx, time_t now); +static void _purge_missing_jobs(int node_inx, time_t now); static void _read_data_array_from_file(char *file_name, char ***data, uint32_t * size); static void _read_data_from_file(char *file_name, char **data); @@ -5336,6 +5336,13 @@ extern void validate_jobs_on_node(slurm_node_registration_status_msg_t *reg_msg) reg_msg->node_name); return; } + + if (node_ptr->up_time > reg_msg->up_time) { + verbose("Node %s rebooted %u secs ago", + reg_msg->node_name, reg_msg->up_time); + } + node_ptr->up_time = reg_msg->up_time; + node_inx = node_ptr - node_record_table_ptr; /* Check that jobs running are really supposed to be there */ @@ -5415,7 +5422,7 @@ extern void validate_jobs_on_node(slurm_node_registration_status_msg_t *reg_msg) jobs_on_node = node_ptr->run_job_cnt + node_ptr->comp_job_cnt; if (jobs_on_node) - _purge_lost_batch_jobs(node_inx, now); + _purge_missing_jobs(node_inx, now); if (jobs_on_node != reg_msg->job_count) { /* slurmd will not know of a job unless the job has @@ -5431,42 +5438,64 @@ extern void validate_jobs_on_node(slurm_node_registration_status_msg_t *reg_msg) } /* Purge any batch job that should have its script running on node - * node_inx, but is not. Allow "batch_start_timeout" secs for startup. + * node_inx, but is not. Allow BatchStartTimeout + ResumeTimeout seconds + * for startup. + * + * Purge all job steps that were started before the node was last booted. * * Also notify srun if any job steps should be active on this node * but are not found. */ -static void _purge_lost_batch_jobs(int node_inx, time_t now) +static void _purge_missing_jobs(int node_inx, time_t now) { ListIterator job_iterator; struct job_record *job_ptr; - uint16_t batch_start_timeout = slurm_get_batch_start_timeout() + - slurm_get_resume_timeout(); - time_t recent = now - batch_start_timeout; + struct node_record *node_ptr = node_record_table_ptr + node_inx; + uint16_t batch_start_timeout = slurm_get_batch_start_timeout(); + uint16_t msg_timeout = slurm_get_msg_timeout(); + uint16_t resume_timeout = slurm_get_resume_timeout(); + uint16_t suspend_time = slurm_get_suspend_time(); + time_t batch_startup_time, node_boot_time = (time_t) 0, startup_time; + + if (node_ptr->up_time) { + node_boot_time = now - node_ptr->up_time; + node_boot_time -= msg_timeout; + node_boot_time -= 5; /* allow for other delays */ + } + batch_startup_time = now - batch_start_timeout; + batch_startup_time -= msg_timeout; job_iterator = list_iterator_create(job_list); while ((job_ptr = (struct job_record *) list_next(job_iterator))) { bool job_active = ((job_ptr->job_state == JOB_RUNNING) || (job_ptr->job_state == JOB_SUSPENDED)); + if ((!job_active) || (!bit_test(job_ptr->node_bitmap, node_inx))) continue; - if (job_ptr->batch_flag == 0) { - _notify_srun_missing_step(job_ptr, node_inx, now); - continue; + if ((job_ptr->batch_flag != 0) && + (suspend_time != 0) /* power mgmt on */ && + (job_ptr->start_time < node_boot_time)) { + startup_time = batch_startup_time - resume_timeout; + } else + startup_time = batch_startup_time; + + if ((job_ptr->batch_flag != 0) && + (job_ptr->time_last_active < startup_time) && + (job_ptr->start_time < startup_time) && + (node_inx == bit_ffs(job_ptr->node_bitmap))) { + info("Batch JobId=%u missing from node 0, killing it", + job_ptr->job_id); + job_complete(job_ptr->job_id, 0, false, NO_VAL); + } else { + _notify_srun_missing_step(job_ptr, node_inx, + now, node_boot_time); } - if (((job_ptr->time_last_active+batch_start_timeout) > now) || - (job_ptr->start_time >= recent) || - (node_inx != bit_ffs(job_ptr->node_bitmap))) - continue; - info("Batch JobId=%u missing from master node, killing it", - job_ptr->job_id); - job_complete(job_ptr->job_id, 0, false, NO_VAL); } list_iterator_destroy(job_iterator); } static void _notify_srun_missing_step(struct job_record *job_ptr, int node_inx, - time_t now) + time_t now, time_t node_boot_time) { ListIterator step_iterator; struct step_record *step_ptr; @@ -5475,6 +5504,8 @@ static void _notify_srun_missing_step(struct job_record *job_ptr, int node_inx, xassert(job_ptr); step_iterator = list_iterator_create (job_ptr->step_list); while ((step_ptr = (struct step_record *) list_next (step_iterator))) { + if (!bit_test(step_ptr->step_node_bitmap, node_inx)) + continue; if (step_ptr->time_last_active >= now) { /* Back up timer in case more than one node * registration happens at this same time. @@ -5482,7 +5513,18 @@ static void _notify_srun_missing_step(struct job_record *job_ptr, int node_inx, * to count toward a different node's * registration message. */ step_ptr->time_last_active = now - 1; + } else if ((step_ptr->start_time < node_boot_time) && + (step_ptr->no_kill == 0)) { + /* There is a risk that the job step's tasks completed + * on this node before its reboot, but that should be + * very rare */ + info("Node %s rebooted, killing missing step %u.%u", + node_name, job_ptr->job_id, step_ptr->step_id); + srun_step_complete(step_ptr); + signal_step_tasks(step_ptr, SIGKILL); } else if (step_ptr->host && step_ptr->port) { + /* srun may be able to verify step exists on + * this node using I/O sockets */ srun_step_missing(step_ptr, node_name); } } diff --git a/src/slurmctld/slurmctld.h b/src/slurmctld/slurmctld.h index 8cc32ee7a785644ddba102d55c13a4053810ba9a..267230b7bf8b0c9533c4a4fc94ef8dcb73ad85e9 100644 --- a/src/slurmctld/slurmctld.h +++ b/src/slurmctld/slurmctld.h @@ -213,6 +213,7 @@ struct node_record { uint16_t threads; /* number of threads per core */ uint32_t real_memory; /* MB real memory on the node */ uint32_t tmp_disk; /* MB total disk in TMP_FS */ + uint32_t up_time; /* seconds since node boot */ struct config_record *config_ptr; /* configuration spec ptr */ uint16_t part_cnt; /* number of associated partitions */ struct part_record **part_pptr; /* array of pointers to partitions diff --git a/src/slurmd/slurmd/get_mach_stat.c b/src/slurmd/slurmd/get_mach_stat.c index c5c83f145a64d67b9c61242eb42ace34a8e6b906..f4de42301e2f351f21e92780ad57870f31c79d8a 100644 --- a/src/slurmd/slurmd/get_mach_stat.c +++ b/src/slurmd/slurmd/get_mach_stat.c @@ -64,6 +64,16 @@ #include <stdlib.h> #include <string.h> #include <syslog.h> + +#ifdef HAVE_AIX +# include <sys/times.h> +# include <sys/types.h> +#else +/* NOTE: Getting the system uptime on AIX uses completely different logic. + * sys/sysinfo.h on AIX defines structures that conflict with SLURM code. */ +# include <sys/sysinfo.h> +#endif + #include <sys/utsname.h> #ifdef HAVE_SYS_VFS_H # include <sys/vfs.h> @@ -79,17 +89,17 @@ static char* _cpuinfo_path = "/proc/cpuinfo"; -int compute_block_map(uint16_t numproc, - uint16_t **block_map, uint16_t **block_map_inv); -int chk_cpuinfo_str(char *buffer, char *keyword, char **valptr); -int chk_cpuinfo_uint32(char *buffer, char *keyword, uint32_t *val); -int chk_cpuinfo_float(char *buffer, char *keyword, float *val); +static int _compute_block_map(uint16_t numproc, + uint16_t **block_map, uint16_t **block_map_inv); +static int _chk_cpuinfo_str(char *buffer, char *keyword, char **valptr); +static int _chk_cpuinfo_uint32(char *buffer, char *keyword, uint32_t *val); /* #define DEBUG_DETAIL 1 */ /* enable detailed debugging within SLURM */ #if DEBUG_MODULE #define DEBUG_DETAIL 1 -#define debug0 printf +#define error printf +#define debug printf #define debug1 printf #define debug2 printf #define debug3 printf @@ -110,6 +120,8 @@ main(int argc, char * argv[]) char node_name[MAX_SLURM_NAME]; float speed; uint16_t testnumproc = 0; + uint32_t up_time = 0; + int days, hours, mins, secs; if (argc > 1) { _cpuinfo_path = argv[1]; @@ -132,6 +144,7 @@ main(int argc, char * argv[]) xfree(block_map_inv); /* not used here */ error_code += get_memory(&this_node.real_memory); error_code += get_tmp_disk(&this_node.tmp_disk, "/tmp"); + error_code += get_up_time(&up_time); #ifdef USE_CPU_SPEED error_code += get_speed(&speed); #endif @@ -142,6 +155,12 @@ main(int argc, char * argv[]) this_node.sockets, this_node.cores, this_node.threads); debug3("\tRealMemory=%u TmpDisk=%u Speed=%f\n", this_node.real_memory, this_node.tmp_disk, speed); + secs = up_time % 60; + mins = (up_time / 60) % 60; + hours = (up_time / 3600) % 24; + days = (up_time / 86400); + debug3("\tUpTime=%u=%u-%2.2u:%2.2u:%2.2u\n", + up_time, days, hours, mins, secs); if (error_code != 0) debug3("get_mach_stat error_code=%d encountered\n", error_code); exit (error_code); @@ -368,8 +387,37 @@ get_tmp_disk(uint32_t *tmp_disk, char *tmp_fs) return error_code; } +extern int get_up_time(uint32_t *up_time) +{ +#ifdef HAVE_AIX + clock_t tm; + struct tms buf; + + tm = times(&buf); + if (tm == (clock_t) -1) { + *up_time = 0; + return errno; + } + + *up_time = tm / sysconf(_SC_CLK_TCK); +#else + /* NOTE for Linux: The return value of times() may overflow the + * possible range of type clock_t. There is also an offset of + * 429 million seconds on some implementations. We just use the + * simpler sysinfo() function instead. */ + struct sysinfo info; + + if (sysinfo(&info) < 0) { + *up_time = 0; + return errno; + } + + *up_time = info.uptime; +#endif + return 0; +} -/* chk_cpuinfo_str +/* _chk_cpuinfo_str * check a line of cpuinfo data (buffer) for a keyword. If it * exists, return the string value for that keyword in *valptr. * Input: buffer - single line of cpuinfo data @@ -377,7 +425,7 @@ get_tmp_disk(uint32_t *tmp_disk, char *tmp_fs) * Output: valptr - string value corresponding to keyword * return code - true if keyword found, false if not found */ -int chk_cpuinfo_str(char *buffer, char *keyword, char **valptr) +static int _chk_cpuinfo_str(char *buffer, char *keyword, char **valptr) { char *ptr; if (strncmp(buffer, keyword, strlen(keyword))) @@ -390,7 +438,7 @@ int chk_cpuinfo_str(char *buffer, char *keyword, char **valptr) return true; } -/* chk_cpuinfo_uint32 +/* _chk_cpuinfo_uint32 * check a line of cpuinfo data (buffer) for a keyword. If it * exists, return the uint16 value for that keyword in *valptr. * Input: buffer - single line of cpuinfo data @@ -398,18 +446,18 @@ int chk_cpuinfo_str(char *buffer, char *keyword, char **valptr) * Output: valptr - uint32 value corresponding to keyword * return code - true if keyword found, false if not found */ -int chk_cpuinfo_uint32(char *buffer, char *keyword, uint32_t *val) +static int _chk_cpuinfo_uint32(char *buffer, char *keyword, uint32_t *val) { char *valptr; - if (chk_cpuinfo_str(buffer, keyword, &valptr)) { + if (_chk_cpuinfo_str(buffer, keyword, &valptr)) { *val = strtoul(valptr, (char **)NULL, 10); return true; } else { return false; } } - -/* chk_cpuinfo_float +#ifdef USE_CPU_SPEED +/* _chk_cpuinfo_float * check a line of cpuinfo data (buffer) for a keyword. If it * exists, return the float value for that keyword in *valptr. * Input: buffer - single line of cpuinfo data @@ -417,10 +465,10 @@ int chk_cpuinfo_uint32(char *buffer, char *keyword, uint32_t *val) * Output: valptr - float value corresponding to keyword * return code - true if keyword found, false if not found */ -int chk_cpuinfo_float(char *buffer, char *keyword, float *val) +static int _chk_cpuinfo_float(char *buffer, char *keyword, float *val) { char *valptr; - if (chk_cpuinfo_str(buffer, keyword, &valptr)) { + if (_chk_cpuinfo_str(buffer, keyword, &valptr)) { *val = (float) strtod(valptr, (char **)NULL); return true; } else { @@ -428,7 +476,6 @@ int chk_cpuinfo_float(char *buffer, char *keyword, float *val) } } -#ifdef USE_CPU_SPEED /* * get_speed - Return the speed of procs on this system (MHz clock) * Input: procs - buffer for the CPU speed @@ -444,12 +491,12 @@ get_speed(float *speed) *speed = 1.0; cpu_info_file = fopen(_cpuinfo_path, "r"); if (cpu_info_file == NULL) { - error ("get_speed: error %d opening %s\n", errno, _cpuinfo_path); + error("get_speed: error %d opening %s\n", errno, _cpuinfo_path); return errno; } while (fgets(buffer, sizeof(buffer), cpu_info_file) != NULL) { - chk_cpuinfo_float(buffer, "cpu MHz", speed); + _chk_cpuinfo_float(buffer, "cpu MHz", speed); } fclose(cpu_info_file); @@ -533,7 +580,7 @@ get_cpuinfo(uint16_t numproc, curcpu = 0; while (fgets(buffer, sizeof(buffer), cpu_info_file) != NULL) { uint32_t val; - if (chk_cpuinfo_uint32(buffer, "processor", &val)) { + if (_chk_cpuinfo_uint32(buffer, "processor", &val)) { numcpu++; curcpu = val; if (val >= numproc) { /* out of bounds, ignore */ @@ -545,7 +592,7 @@ get_cpuinfo(uint16_t numproc, cpuinfo[val].cpuid = val; maxcpuid = MAX(maxcpuid, val); mincpuid = MIN(mincpuid, val); - } else if (chk_cpuinfo_uint32(buffer, "physical id", &val)) { + } else if (_chk_cpuinfo_uint32(buffer, "physical id", &val)) { /* see if the ID has already been seen */ for (i=0; i<numproc; i++) { if ((cpuinfo[i].physid == val) @@ -566,7 +613,7 @@ get_cpuinfo(uint16_t numproc, maxphysid = MAX(maxphysid, val); minphysid = MIN(minphysid, val); - } else if (chk_cpuinfo_uint32(buffer, "core id", &val)) { + } else if (_chk_cpuinfo_uint32(buffer, "core id", &val)) { /* see if the ID has already been seen */ for (i = 0; i < numproc; i++) { if ((cpuinfo[i].coreid == val) @@ -587,7 +634,7 @@ get_cpuinfo(uint16_t numproc, maxcoreid = MAX(maxcoreid, val); mincoreid = MIN(mincoreid, val); - } else if (chk_cpuinfo_uint32(buffer, "siblings", &val)) { + } else if (_chk_cpuinfo_uint32(buffer, "siblings", &val)) { /* Note: this value is a count, not an index */ if (val > numproc) { /* out of bounds, ignore */ debug("siblings is %u (> %d), ignored", @@ -598,7 +645,7 @@ get_cpuinfo(uint16_t numproc, cpuinfo[curcpu].siblings = val; maxsibs = MAX(maxsibs, val); minsibs = MIN(minsibs, val); - } else if (chk_cpuinfo_uint32(buffer, "cpu cores", &val)) { + } else if (_chk_cpuinfo_uint32(buffer, "cpu cores", &val)) { /* Note: this value is a count, not an index */ if (val > numproc) { /* out of bounds, ignore */ debug("cores is %u (> %d), ignored", @@ -695,7 +742,7 @@ get_cpuinfo(uint16_t numproc, #endif *block_map_size = numcpu; - retval = compute_block_map(*block_map_size, block_map, block_map_inv); + retval = _compute_block_map(*block_map_size, block_map, block_map_inv); xfree(cpuinfo); /* done with raw cpuinfo data */ @@ -703,7 +750,7 @@ get_cpuinfo(uint16_t numproc, } /* - * compute_block_map - Compute abstract->machine block mapping (and inverse) + * _compute_block_map - Compute abstract->machine block mapping (and inverse) * allows computation of CPU ID masks for an abstract block distribution * of logical processors which can then be mapped the IDs used in the * actual machine processor ID ordering (which can be BIOS/OS dependendent) @@ -764,7 +811,7 @@ static int _icmp32(uint32_t a, uint32_t b) } } -int _compare_cpus(const void *a1, const void *b1) { +static int _compare_cpus(const void *a1, const void *b1) { uint16_t *a = (uint16_t *) a1; uint16_t *b = (uint16_t *) b1; int cmp; @@ -785,8 +832,8 @@ int _compare_cpus(const void *a1, const void *b1) { return cmp; } -int compute_block_map(uint16_t numproc, - uint16_t **block_map, uint16_t **block_map_inv) +static int _compute_block_map(uint16_t numproc, + uint16_t **block_map, uint16_t **block_map_inv) { uint16_t i; /* Compute abstract->machine block mapping (and inverse) */ diff --git a/src/slurmd/slurmd/get_mach_stat.h b/src/slurmd/slurmd/get_mach_stat.h index 1fcb78121066ea5043273f130691936579221219..f82ba29f47cb0ae046f1caf6e1da7dc9904bd224 100644 --- a/src/slurmd/slurmd/get_mach_stat.h +++ b/src/slurmd/slurmd/get_mach_stat.h @@ -60,6 +60,7 @@ extern int get_cpuinfo(uint16_t numproc, extern int get_mach_name(char *node_name); extern int get_memory(uint32_t *real_memory); extern int get_tmp_disk(uint32_t *tmp_disk, char *tmp_fs); +extern int get_up_time(uint32_t *up_time); #ifdef USE_OS_NAME extern int get_os_name(char *os_name); diff --git a/src/slurmd/slurmd/req.c b/src/slurmd/slurmd/req.c index 56473ee02c3df13e39ace249a88cc78272aaa8ed..5410d4311f49a3a97d72401a16427b28929bc8e8 100644 --- a/src/slurmd/slurmd/req.c +++ b/src/slurmd/slurmd/req.c @@ -154,7 +154,7 @@ static gids_t *_gids_cache_lookup(char *user, gid_t gid); static List waiters; static pthread_mutex_t launch_mutex = PTHREAD_MUTEX_INITIALIZER; -static time_t booted = 0; +static time_t startup = 0; /* daemon startup time */ static time_t last_slurmctld_msg = 0; static pthread_mutex_t job_limits_mutex = PTHREAD_MUTEX_INITIALIZER; @@ -175,8 +175,8 @@ slurmd_req(slurm_msg_t *msg) int rc; if (msg == NULL) { - if (booted == 0) - booted = time(NULL); + if (startup == 0) + startup = time(NULL); if (waiters) { list_destroy(waiters); waiters = NULL; @@ -1731,7 +1731,7 @@ _rpc_daemon_status(slurm_msg_t *msg) resp->actual_threads = conf->actual_threads; resp->actual_real_mem = conf->real_memory_size; resp->actual_tmp_disk = conf->tmp_disk_space; - resp->booted = booted; + resp->booted = startup; resp->hostname = xstrdup(conf->node_name); resp->step_list = _get_step_list(); resp->last_slurmctld_msg = last_slurmctld_msg; diff --git a/src/slurmd/slurmd/slurmd.c b/src/slurmd/slurmd/slurmd.c index 77c1b8145f0e99784069e5c3b7106e5d1f8707fa..58cc1cc5bdac39ff7b46d6875389d6a0984dafb3 100644 --- a/src/slurmd/slurmd/slurmd.c +++ b/src/slurmd/slurmd/slurmd.c @@ -72,6 +72,7 @@ #include "src/common/slurm_cred.h" #include "src/common/slurm_protocol_api.h" #include "src/common/parse_spec.h" +#include "src/common/parse_time.h" #include "src/common/hostlist.h" #include "src/common/macros.h" #include "src/common/fd.h" @@ -121,33 +122,37 @@ typedef struct connection { static sig_atomic_t _shutdown = 0; static sig_atomic_t _reconfig = 0; static pthread_t msg_pthread = (pthread_t) 0; +static time_t sent_reg_time = (time_t) 0; -static void _term_handler(int); +static void _atfork_final(void); +static void _atfork_prepare(void); +static void _create_msg_socket(void); +static void _decrement_thd_count(void); +static void _destroy_conf(void); +static void _fill_registration_msg(slurm_node_registration_status_msg_t *); +static void _handle_connection(slurm_fd fd, slurm_addr *client); static void _hup_handler(int); +static void _increment_thd_count(void); +static void _init_conf(void); +static void _install_fork_handlers(void); +static void _kill_old_slurmd(void); +static void _msg_engine(void); +static void _print_conf(void); static void _process_cmdline(int ac, char **av); -static void _create_msg_socket(); -static void _msg_engine(); -static int _slurmd_init(); -static int _slurmd_fini(); -static void _init_conf(); -static void _destroy_conf(); -static void _print_conf(); -static void _read_config(); -static void _kill_old_slurmd(); -static void _reconfigure(); +static void _read_config(void); +static void _reconfigure(void); +static void *_registration_engine(void *arg); static int _restore_cred_state(slurm_cred_ctx_t ctx); -static void _increment_thd_count(); -static void _decrement_thd_count(); -static void _wait_for_all_threads(); -static int _set_slurmd_spooldir(void); -static void _usage(); -static void _handle_connection(slurm_fd fd, slurm_addr *client); static void *_service_connection(void *); -static void _fill_registration_msg(slurm_node_registration_status_msg_t *); +static int _set_slurmd_spooldir(void); +static int _slurmd_init(void); +static int _slurmd_fini(void); +static void _spawn_registration_engine(void); +static void _term_handler(int); static void _update_logging(void); -static void _atfork_prepare(void); -static void _atfork_final(void); -static void _install_fork_handlers(void); +static void _usage(void); +static void _wait_for_all_threads(void); + int main (int argc, char *argv[]) @@ -193,11 +198,11 @@ main (int argc, char *argv[]) char *curr_user = NULL; /* since when you do a getpwuid you get a pointer to a - structure you have to do a xstrdup on the first - call or your information will just get over - written. This is a memory leak, but a fatal is - called right after so it isn't that big of a deal. - */ + * structure you have to do a xstrdup on the first + * call or your information will just get over + * written. This is a memory leak, but a fatal is + * called right after so it isn't that big of a deal. + */ if ((pw=getpwuid(slurmd_uid))) slurmd_user = xstrdup(pw->pw_name); if ((pw=getpwuid(curr_uid))) @@ -283,13 +288,11 @@ main (int argc, char *argv[]) info("%s started on %T", xbasename(argv[0])); - if (send_registration_msg(SLURM_SUCCESS, true) < 0) - error("Unable to register with slurm controller"); - _install_fork_handlers(); list_install_fork_handlers(); slurm_conf_install_fork_handlers(); - + + _spawn_registration_engine(); _msg_engine(); /* @@ -314,9 +317,59 @@ main (int argc, char *argv[]) return 0; } +static void +_spawn_registration_engine(void) +{ + int rc; + pthread_attr_t attr; + pthread_t id; + int retries = 0; + + slurm_attr_init(&attr); + rc = pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_DETACHED); + if (rc != 0) { + errno = rc; + fatal("Unable to set detachstate on attr: %m"); + slurm_attr_destroy(&attr); + return; + } + + while (pthread_create(&id, &attr, &_registration_engine, NULL)) { + error("msg_engine: pthread_create: %m"); + if (++retries > 3) + fatal("msg_engine: pthread_create: %m"); + usleep(10); /* sleep and again */ + } + + return; +} + +/* Spawn a thread to make sure we send at least one registration message to + * slurmctld. If slurmctld restarts, it will request another registration + * message. */ +static void * +_registration_engine(void *arg) +{ + _increment_thd_count(); + + while (!_shutdown) { + if ((sent_reg_time == (time_t) 0) && + (send_registration_msg(SLURM_SUCCESS, true) != + SLURM_SUCCESS)) { + debug("Unable to register with slurm controller, " + "retrying"); + } else if (_shutdown || sent_reg_time) { + break; + } + sleep(1); + } + + _decrement_thd_count(); + return NULL; +} static void -_msg_engine() +_msg_engine(void) { slurm_fd sock; @@ -350,7 +403,7 @@ static void _decrement_thd_count(void) { slurm_mutex_lock(&active_mutex); - if(active_threads>0) + if (active_threads>0) active_threads--; pthread_cond_signal(&active_cond); slurm_mutex_unlock(&active_mutex); @@ -375,7 +428,7 @@ _increment_thd_count(void) } static void -_wait_for_all_threads() +_wait_for_all_threads(void) { slurm_mutex_lock(&active_mutex); while (active_threads > 0) { @@ -460,10 +513,10 @@ cleanup: return NULL; } -int +extern int send_registration_msg(uint32_t status, bool startup) { - int retval = SLURM_SUCCESS; + int ret_val = SLURM_SUCCESS; slurm_msg_t req; slurm_msg_t resp; slurm_node_registration_status_msg_t *msg = @@ -481,15 +534,17 @@ send_registration_msg(uint32_t status, bool startup) if (slurm_send_recv_controller_msg(&req, &resp) < 0) { error("Unable to register: %m"); - retval = SLURM_FAILURE; - } else - slurm_free_return_code_msg(resp.data); + ret_val = SLURM_FAILURE; + } else { + sent_reg_time = time(NULL); + slurm_free_return_code_msg(resp.data); + } slurm_free_node_registration_status_msg (msg); /* XXX look at response msg */ - return SLURM_SUCCESS; + return ret_val; } static void @@ -511,17 +566,20 @@ _fill_registration_msg(slurm_node_registration_status_msg_t *msg) msg->real_memory = conf->real_memory_size; msg->tmp_disk = conf->tmp_disk_space; + get_up_time(&conf->up_time); + msg->up_time = conf->up_time; + if (first_msg) { first_msg = false; info("Procs=%u Sockets=%u Cores=%u Threads=%u " - "Memory=%u TmpDisk=%u", + "Memory=%u TmpDisk=%u Uptime=%u", msg->cpus, msg->sockets, msg->cores, msg->threads, - msg->real_memory, msg->tmp_disk); + msg->real_memory, msg->tmp_disk, msg->up_time); } else { debug3("Procs=%u Sockets=%u Cores=%u Threads=%u " - "Memory=%u TmpDisk=%u", + "Memory=%u TmpDisk=%u Uptime=%u", msg->cpus, msg->sockets, msg->cores, msg->threads, - msg->real_memory, msg->tmp_disk); + msg->real_memory, msg->tmp_disk, msg->up_time); } uname(&buf); if ((arch = getenv("SLURM_ARCH"))) @@ -609,7 +667,7 @@ _massage_pathname(char **path) * values into the slurmd configuration in preference of the defaults. */ static void -_read_config() +_read_config(void) { char *path_pubkey = NULL; slurm_ctl_conf_t *cf = NULL; @@ -679,6 +737,7 @@ _read_config() conf->threads = conf->actual_threads; get_memory(&conf->real_memory_size); + get_up_time(&conf->up_time); cf = slurm_conf_lock(); get_tmp_disk(&conf->tmp_disk_space, cf->tmp_fs); @@ -706,6 +765,7 @@ _read_config() fatal("Unable to establish controller machine"); if (cf->slurmctld_port == 0) fatal("Unable to establish controller port"); + conf->slurmd_timeout = cf->slurmd_timeout; conf->use_pam = cf->use_pam; conf->task_plugin_param = cf->task_plugin_param; @@ -742,10 +802,10 @@ _reconfigure(void) } static void -_print_conf() +_print_conf(void) { slurm_ctl_conf_t *cf; - char *str; + char *str, time_str[32]; int i; cf = slurm_conf_lock(); @@ -768,6 +828,10 @@ _print_conf() conf->threads, conf->conf_threads, conf->actual_threads); + + secs2time_str((time_t)conf->up_time, time_str, sizeof(time_str)); + debug3("UpTime = %u = %s", conf->up_time, time_str); + str = xmalloc(conf->block_map_size*5); str[0] = '\0'; for (i = 0; i < conf->block_map_size; i++) { @@ -811,7 +875,7 @@ _print_conf() /* Initialize slurmd configuration table. * Everything is already NULL/zero filled when called */ static void -_init_conf() +_init_conf(void) { char host[MAXHOSTNAMELEN]; log_options_t lopts = LOG_OPTS_INITIALIZER; @@ -833,7 +897,7 @@ _init_conf() } static void -_destroy_conf() +_destroy_conf(void) { if(conf) { xfree(conf->block_map); @@ -902,7 +966,7 @@ _process_cmdline(int ac, char **av) exit(0); break; default: - _usage(c); + _usage(); exit(1); break; } @@ -911,7 +975,7 @@ _process_cmdline(int ac, char **av) static void -_create_msg_socket() +_create_msg_socket(void) { char* node_addr; @@ -940,7 +1004,7 @@ _create_msg_socket() static int -_slurmd_init() +_slurmd_init(void) { struct rlimit rlim; slurm_ctl_conf_t *cf; @@ -1097,7 +1161,7 @@ cleanup: * All allocated memory should be freed \**************************************************************************/ static int -_slurmd_fini() +_slurmd_fini(void) { save_cred_state(conf->vctx); int slurm_proctrack_init(); diff --git a/src/slurmd/slurmd/slurmd.h b/src/slurmd/slurmd/slurmd.h index 42680fc7eb430319730dada02104750364e2dee0..f4a89129b716c30b6c78e78311686d69cfffac79 100644 --- a/src/slurmd/slurmd/slurmd.h +++ b/src/slurmd/slurmd/slurmd.h @@ -90,6 +90,7 @@ typedef struct slurmd_config { uint16_t actual_threads; /* actual thread per core count */ uint32_t real_memory_size; /* amount of real memory */ uint32_t tmp_disk_space; /* size of temporary disk */ + uint32_t up_time; /* seconds since last boot time */ uint16_t block_map_size; /* size of block map */ uint16_t *block_map; /* abstract->machine block map */ uint16_t *block_map_inv; /* machine->abstract (inverse) map */ @@ -121,6 +122,7 @@ typedef struct slurmd_config { slurm_cred_ctx_t vctx; /* slurm_cred_t verifier context */ + uint16_t slurmd_timeout; /* SlurmdTimeout */ uid_t slurm_user_id; /* UID that slurmctld runs as */ pthread_mutex_t config_mutex; /* lock for slurmd_config access */ uint16_t job_acct_gather_freq;