diff --git a/src/common/slurm_jobacct.c b/src/common/slurm_jobacct.c index 87d1cd5f789043485fc47497de64de175e12c5ca..055ea714e17e6bfea25bbfcfa44f6bf07de5a3a4 100644 --- a/src/common/slurm_jobacct.c +++ b/src/common/slurm_jobacct.c @@ -86,31 +86,6 @@ typedef struct slurm_jobacct_ops { void (*jobacct_suspendpoll) (); } slurm_jobacct_ops_t; -/* - * These strings must be in the same order as the fields declared - * for slurm_jobacct_ops_t. - */ -static const char *syms[] = { - "jobacct_p_init_struct", - "jobacct_p_alloc", - "jobacct_p_free", - "jobacct_p_setinfo", - "jobacct_p_getinfo", - "jobacct_p_aggregate", - "jobacct_p_pack", - "jobacct_p_unpack", - "jobacct_p_init_slurmctld", - "jobacct_p_fini_slurmctld", - "jobacct_p_job_start_slurmctld", - "jobacct_p_job_complete_slurmctld", - "jobacct_p_step_start_slurmctld", - "jobacct_p_step_complete_slurmctld", - "jobacct_p_suspend_slurmctld", - "jobacct_p_startpoll", - "jobacct_p_endpoll", - "jobacct_p_suspendpoll", -}; - /* * A global job accounting context. "Global" in the sense that there's * only one, with static bindings. We don't export it. @@ -184,7 +159,31 @@ _slurm_jobacct_context_destroy( slurm_jobacct_context_t c ) static slurm_jobacct_ops_t * _slurm_jobacct_get_ops( slurm_jobacct_context_t c ) { - int n_syms = sizeof( syms ) / sizeof( char * ); + /* + * These strings must be in the same order as the fields declared + * for slurm_jobacct_ops_t. + */ + static const char *syms[] = { + "jobacct_p_init_struct", + "jobacct_p_alloc", + "jobacct_p_free", + "jobacct_p_setinfo", + "jobacct_p_getinfo", + "jobacct_p_aggregate", + "jobacct_p_pack", + "jobacct_p_unpack", + "jobacct_p_init_slurmctld", + "jobacct_p_fini_slurmctld", + "jobacct_p_job_start_slurmctld", + "jobacct_p_job_complete_slurmctld", + "jobacct_p_step_start_slurmctld", + "jobacct_p_step_complete_slurmctld", + "jobacct_p_suspend_slurmctld", + "jobacct_p_startpoll", + "jobacct_p_endpoll", + "jobacct_p_suspendpoll" + }; + int n_syms = sizeof( syms ) / sizeof( char * ); int rc = 0; /* Get the plugin list, if needed. */ if ( c->plugin_list == NULL ) { @@ -303,6 +302,10 @@ extern jobacctinfo_t *jobacct_g_alloc() extern int jobacct_g_free(jobacctinfo_t *jobacct) { int retval = SLURM_SUCCESS; + + if (_slurm_jobacct_init() < 0) + return SLURM_ERROR; + slurm_mutex_lock( &g_jobacct_context_lock ); if ( g_jobacct_context ) retval = (*(g_jobacct_context->ops.jobacct_free))(jobacct); @@ -316,6 +319,9 @@ extern int jobacct_g_setinfo(jobacctinfo_t *jobacct, { int retval = SLURM_SUCCESS; + if (_slurm_jobacct_init() < 0) + return SLURM_ERROR; + slurm_mutex_lock( &g_jobacct_context_lock ); if ( g_jobacct_context ) retval = (*(g_jobacct_context->ops.jobacct_setinfo)) @@ -329,6 +335,9 @@ extern int jobacct_g_getinfo(jobacctinfo_t *jobacct, { int retval = SLURM_SUCCESS; + if (_slurm_jobacct_init() < 0) + return SLURM_ERROR; + slurm_mutex_lock( &g_jobacct_context_lock ); if ( g_jobacct_context ) retval = (*(g_jobacct_context->ops.jobacct_getinfo)) @@ -339,6 +348,9 @@ extern int jobacct_g_getinfo(jobacctinfo_t *jobacct, extern void jobacct_g_aggregate(jobacctinfo_t *dest, jobacctinfo_t *from) { + if (_slurm_jobacct_init() < 0) + return; + slurm_mutex_lock( &g_jobacct_context_lock ); if ( g_jobacct_context ) (*(g_jobacct_context->ops.jobacct_aggregate))(dest, from); @@ -348,6 +360,9 @@ extern void jobacct_g_aggregate(jobacctinfo_t *dest, jobacctinfo_t *from) extern void jobacct_g_pack(jobacctinfo_t *jobacct, Buf buffer) { + if (_slurm_jobacct_init() < 0) + return; + slurm_mutex_lock( &g_jobacct_context_lock ); if ( g_jobacct_context ) (*(g_jobacct_context->ops.jobacct_pack))(jobacct, buffer); @@ -359,6 +374,9 @@ extern int jobacct_g_unpack(jobacctinfo_t **jobacct, Buf buffer) { int retval = SLURM_SUCCESS; + if (_slurm_jobacct_init() < 0) + return SLURM_ERROR; + slurm_mutex_lock( &g_jobacct_context_lock ); if ( g_jobacct_context ) retval = (*(g_jobacct_context->ops.jobacct_unpack)) @@ -384,6 +402,9 @@ extern int jobacct_g_init_slurmctld(char *job_acct_log) extern int jobacct_g_fini_slurmctld() { int retval = SLURM_SUCCESS; + if (_slurm_jobacct_init() < 0) + return SLURM_ERROR; + slurm_mutex_lock( &g_jobacct_context_lock ); if ( g_jobacct_context ) retval = (*(g_jobacct_context->ops.jobacct_fini))(); @@ -397,7 +418,9 @@ extern int jobacct_g_fini_slurmctld() extern int jobacct_g_job_start_slurmctld(struct job_record *job_ptr) { int retval = SLURM_SUCCESS; - + if (_slurm_jobacct_init() < 0) + return SLURM_ERROR; + slurm_mutex_lock( &g_jobacct_context_lock ); if ( g_jobacct_context ) retval = (*(g_jobacct_context->ops.jobacct_job_start)) @@ -409,7 +432,9 @@ extern int jobacct_g_job_start_slurmctld(struct job_record *job_ptr) extern int jobacct_g_job_complete_slurmctld(struct job_record *job_ptr) { int retval = SLURM_SUCCESS; - + if (_slurm_jobacct_init() < 0) + return SLURM_ERROR; + slurm_mutex_lock( &g_jobacct_context_lock ); if ( g_jobacct_context ) retval = (*(g_jobacct_context->ops.jobacct_job_complete)) @@ -421,7 +446,9 @@ extern int jobacct_g_job_complete_slurmctld(struct job_record *job_ptr) extern int jobacct_g_step_start_slurmctld(struct step_record *step_ptr) { int retval = SLURM_SUCCESS; - + if (_slurm_jobacct_init() < 0) + return SLURM_ERROR; + slurm_mutex_lock( &g_jobacct_context_lock ); if ( g_jobacct_context ) retval = (*(g_jobacct_context->ops.jobacct_step_start)) @@ -433,7 +460,9 @@ extern int jobacct_g_step_start_slurmctld(struct step_record *step_ptr) extern int jobacct_g_step_complete_slurmctld(struct step_record *step_ptr) { int retval = SLURM_SUCCESS; - + if (_slurm_jobacct_init() < 0) + return SLURM_ERROR; + slurm_mutex_lock( &g_jobacct_context_lock ); if ( g_jobacct_context ) retval = (*(g_jobacct_context->ops.jobacct_step_complete)) @@ -445,7 +474,9 @@ extern int jobacct_g_step_complete_slurmctld(struct step_record *step_ptr) extern int jobacct_g_suspend_slurmctld(struct job_record *job_ptr) { int retval = SLURM_SUCCESS; - + if (_slurm_jobacct_init() < 0) + return SLURM_ERROR; + slurm_mutex_lock( &g_jobacct_context_lock ); if ( g_jobacct_context ) retval = (*(g_jobacct_context->ops.jobacct_suspend)) @@ -457,6 +488,8 @@ extern int jobacct_g_suspend_slurmctld(struct job_record *job_ptr) extern int jobacct_g_startpoll(int frequency) { int retval = SLURM_SUCCESS; + if (_slurm_jobacct_init() < 0) + return SLURM_ERROR; if (_slurm_jobacct_init() < 0) return SLURM_ERROR; @@ -473,7 +506,9 @@ extern int jobacct_g_startpoll(int frequency) extern int jobacct_g_endpoll(slurmd_job_t *job) { int retval = SLURM_SUCCESS; - + if (_slurm_jobacct_init() < 0) + return SLURM_ERROR; + slurm_mutex_lock( &g_jobacct_context_lock ); if ( g_jobacct_context ) retval = (*(g_jobacct_context->ops.jobacct_endpoll))(job); @@ -483,6 +518,9 @@ extern int jobacct_g_endpoll(slurmd_job_t *job) extern void jobacct_g_suspendpoll() { + if (_slurm_jobacct_init() < 0) + return; + slurm_mutex_lock( &g_jobacct_context_lock ); if ( g_jobacct_context ) (*(g_jobacct_context->ops.jobacct_suspendpoll))(); diff --git a/src/common/slurm_protocol_api.c b/src/common/slurm_protocol_api.c index 98ed476a67cf1812c69726db6d0b5821d24402e3..6cf3bc0a255925007bcb8afbee33001507be6e09 100644 --- a/src/common/slurm_protocol_api.c +++ b/src/common/slurm_protocol_api.c @@ -765,7 +765,7 @@ List slurm_receive_msg(slurm_fd fd, slurm_msg_t *msg, int timeout) msg->forward_struct->timeout = timeout-header.forward.timeout; msg->forward_struct->fwd_cnt = header.forward.cnt; - debug("forwarding messages to %d nodes!!!!", + debug3("forwarding messages to %d nodes!!!!", msg->forward_struct->fwd_cnt); if(forward_msg(msg->forward_struct, &header) == SLURM_ERROR) { @@ -978,7 +978,7 @@ int slurm_send_node_msg(slurm_fd fd, slurm_msg_t * msg) /* wait for all the other messages on the tree under us */ if(msg->forward_struct_init == FORWARD_INIT && msg->forward_struct) { - debug("looking for %d", msg->forward_struct->fwd_cnt); + debug3("looking for %d", msg->forward_struct->fwd_cnt); slurm_mutex_lock(&msg->forward_struct->forward_mutex); count = 0; itr = list_iterator_create(msg->ret_list); @@ -987,7 +987,7 @@ int slurm_send_node_msg(slurm_fd fd, slurm_msg_t * msg) count += list_count(ret_type->ret_data_list); } list_iterator_destroy(itr); - debug("Got back %d", count); + debug3("Got back %d", count); while((count < msg->forward_struct->fwd_cnt)) { pthread_cond_wait(&msg->forward_struct->notify, &msg->forward_struct->forward_mutex); @@ -998,10 +998,10 @@ int slurm_send_node_msg(slurm_fd fd, slurm_msg_t * msg) count += list_count(ret_type->ret_data_list); } list_iterator_destroy(itr); - debug("Got back %d", count); + debug3("Got back %d", count); } - debug("Got them all"); + debug3("Got them all"); slurm_mutex_unlock(&msg->forward_struct->forward_mutex); destroy_forward_struct(msg->forward_struct); } diff --git a/src/common/slurm_protocol_defs.h b/src/common/slurm_protocol_defs.h index a8523d1fbc37f6a6936b396469e2ed51bba995b6..488c50674c5bdc0a27dded6ff8cabf2d3b603e3b 100644 --- a/src/common/slurm_protocol_defs.h +++ b/src/common/slurm_protocol_defs.h @@ -683,4 +683,53 @@ extern char *job_state_string_compact(enum job_states inx); extern char *node_state_string(enum node_states inx); extern char *node_state_string_compact(enum node_states inx); +#define safe_read(fd, buf, size) do { \ + int remaining = size; \ + void *ptr = buf; \ + int rc; \ + while (remaining > 0) { \ + rc = read(fd, ptr, remaining); \ + if (rc == 0) { \ + debug("%s:%d: %s: safe_read (%d of %d) EOF", \ + __FILE__, __LINE__, __CURRENT_FUNC__, \ + remaining, (int)size); \ + goto rwfail; \ + } else if (rc < 0) { \ + debug("%s:%d: %s: safe_read (%d of %d) failed: %m", \ + __FILE__, __LINE__, __CURRENT_FUNC__, \ + remaining, (int)size); \ + goto rwfail; \ + } else { \ + ptr += rc; \ + remaining -= rc; \ + if (remaining > 0) \ + debug3("%s:%d: %s: safe_read (%d of %d) partial read", \ + __FILE__, __LINE__, __CURRENT_FUNC__, \ + remaining, (int)size); \ + } \ + } \ + } while (0) + +#define safe_write(fd, buf, size) do { \ + int remaining = size; \ + void *ptr = buf; \ + int rc; \ + while(remaining > 0) { \ + rc = write(fd, ptr, remaining); \ + if (rc < 0) { \ + debug("%s:%d: %s: safe_write (%d of %d) failed: %m", \ + __FILE__, __LINE__, __CURRENT_FUNC__, \ + remaining, (int)size); \ + goto rwfail; \ + } else { \ + ptr += rc; \ + remaining -= rc; \ + if (remaining > 0) \ + debug3("%s:%d: %s: safe_write (%d of %d) partial write", \ + __FILE__, __LINE__, __CURRENT_FUNC__, \ + remaining, (int)size); \ + } \ + } \ + } while (0) + #endif diff --git a/src/plugins/jobacct/common/jobacct_common.c b/src/plugins/jobacct/common/jobacct_common.c index de1ed72311dd2dc50f33f4ec86a3ec60976172f6..462868bed5521c5a67b89614e471eb2439664db3 100644 --- a/src/plugins/jobacct/common/jobacct_common.c +++ b/src/plugins/jobacct/common/jobacct_common.c @@ -71,7 +71,7 @@ extern int common_setinfo(struct jobacctinfo *jobacct, enum jobacct_data_type type, void *data) { int rc = SLURM_SUCCESS; - int *temp = (int *)data; + int *fd = (int *)data; uint32_t *uint32 = (uint32_t *) data; struct rusage *rusage = (struct rusage *) data; struct jobacctinfo *send = (struct jobacctinfo *) data; @@ -81,7 +81,7 @@ extern int common_setinfo(struct jobacctinfo *jobacct, memcpy(jobacct, send, sizeof(struct jobacctinfo)); break; case JOBACCT_DATA_PIPE: - safe_write((int)*temp, jobacct, sizeof(struct jobacctinfo)); + safe_write(*fd, jobacct, sizeof(struct jobacctinfo)); break; case JOBACCT_DATA_RUSAGE: memcpy(&jobacct->rusage, rusage, sizeof(struct rusage)); @@ -107,7 +107,7 @@ extern int common_getinfo(struct jobacctinfo *jobacct, enum jobacct_data_type type, void *data) { int rc = SLURM_SUCCESS; - int *temp = (int *)data; + int *fd = (int *)data; uint32_t *uint32 = (uint32_t *) data; struct rusage *rusage = (struct rusage *) data; struct jobacctinfo *send = (struct jobacctinfo *) data; @@ -117,7 +117,7 @@ extern int common_getinfo(struct jobacctinfo *jobacct, memcpy(send, jobacct, sizeof(struct jobacctinfo)); break; case JOBACCT_DATA_PIPE: - safe_read((int)*temp, jobacct, sizeof(struct jobacctinfo)); + safe_read(*fd, jobacct, sizeof(struct jobacctinfo)); break; case JOBACCT_DATA_RUSAGE: memcpy(rusage, &jobacct->rusage, sizeof(struct rusage)); diff --git a/src/slurmctld/controller.c b/src/slurmctld/controller.c index f3eca51a5cd3e3236b4f024622ff40aae044d3bb..4c40197d85e01769b9db8c1f92383d691fef67f6 100644 --- a/src/slurmctld/controller.c +++ b/src/slurmctld/controller.c @@ -346,6 +346,7 @@ int main(int argc, char *argv[]) /* Plugins are needed to purge job/node data structures, * unplug after other data structures are purged */ g_slurm_jobcomp_fini(); + jobacct_g_fini_slurmctld(); slurm_sched_fini(); slurm_select_fini(); checkpoint_fini(); @@ -361,8 +362,7 @@ int main(int argc, char *argv[]) info("Slurmctld shutdown completing"); log_fini(); - jobacct_g_fini_slurmctld(); - + if (dump_core) abort(); else diff --git a/src/slurmd/common/stepd_api.c b/src/slurmd/common/stepd_api.c index 9cd5e9b15bc66a6535618e8455ba6f6d8c79d997..b351c859c0384f1580518a810934a7512ac8908b 100644 --- a/src/slurmd/common/stepd_api.c +++ b/src/slurmd/common/stepd_api.c @@ -605,7 +605,6 @@ stepd_completion(int fd, step_complete_msg_t *sent) safe_write(fd, &sent->range_last, sizeof(int)); safe_write(fd, &sent->step_rc, sizeof(int)); jobacct_g_setinfo(sent->jobacct, JOBACCT_DATA_PIPE, &fd); - /* Receive the return code and errno */ safe_read(fd, &rc, sizeof(int)); safe_read(fd, &errnum, sizeof(int)); diff --git a/src/slurmd/common/stepd_api.h b/src/slurmd/common/stepd_api.h index 1a7d56c04c94ba20f3f3363969e1fa5004cf8df9..7523e3564008e87741ceb3da3ca49aaaac82913f 100644 --- a/src/slurmd/common/stepd_api.h +++ b/src/slurmd/common/stepd_api.h @@ -177,53 +177,4 @@ int stepd_resume(int fd); */ int stepd_completion(int fd, step_complete_msg_t *sent); -#define safe_read(fd, buf, size) do { \ - int remaining = size; \ - void *ptr = buf; \ - int rc; \ - while (remaining > 0) { \ - rc = read(fd, ptr, remaining); \ - if (rc == 0) { \ - debug("%s:%d: %s: safe_read (%d of %d) EOF", \ - __FILE__, __LINE__, __CURRENT_FUNC__, \ - remaining, (int)size); \ - goto rwfail; \ - } else if (rc < 0) { \ - debug("%s:%d: %s: safe_read (%d of %d) failed: %m", \ - __FILE__, __LINE__, __CURRENT_FUNC__, \ - remaining, (int)size); \ - goto rwfail; \ - } else { \ - ptr += rc; \ - remaining -= rc; \ - if (remaining > 0) \ - debug3("%s:%d: %s: safe_read (%d of %d) partial read", \ - __FILE__, __LINE__, __CURRENT_FUNC__, \ - remaining, (int)size); \ - } \ - } \ - } while (0) - -#define safe_write(fd, buf, size) do { \ - int remaining = size; \ - void *ptr = buf; \ - int rc; \ - while(remaining > 0) { \ - rc = write(fd, ptr, remaining); \ - if (rc < 0) { \ - debug("%s:%d: %s: safe_write (%d of %d) failed: %m", \ - __FILE__, __LINE__, __CURRENT_FUNC__, \ - remaining, (int)size); \ - goto rwfail; \ - } else { \ - ptr += rc; \ - remaining -= rc; \ - if (remaining > 0) \ - debug3("%s:%d: %s: safe_write (%d of %d) partial write", \ - __FILE__, __LINE__, __CURRENT_FUNC__, \ - remaining, (int)size); \ - } \ - } \ - } while (0) - #endif /* _STEPD_API_H */ diff --git a/src/slurmd/slurmstepd/req.c b/src/slurmd/slurmstepd/req.c index e2aeda866f53c97785887e9e94e7cb71704c1cfa..4de222b730c135dfae47e6f5039d0f240426d5b7 100644 --- a/src/slurmd/slurmstepd/req.c +++ b/src/slurmd/slurmstepd/req.c @@ -1009,7 +1009,6 @@ _handle_completion(int fd, slurmd_job_t *job, uid_t uid) step_complete.step_rc = MAX(step_complete.step_rc, step_rc); /************* acct stuff ********************/ - info("read it all now aggregate"); jobacct_g_aggregate(step_complete.jobacct, jobacct); jobacct_g_free(jobacct); /*********************************************/ diff --git a/src/slurmd/slurmstepd/slurmstepd.c b/src/slurmd/slurmstepd/slurmstepd.c index c95bfc9a97d856f88e3a66caad1e48d2b6f97c60..63cfef0c001e9bb5dca633e6b41752bfe95ecde6 100644 --- a/src/slurmd/slurmstepd/slurmstepd.c +++ b/src/slurmd/slurmstepd/slurmstepd.c @@ -204,7 +204,7 @@ _init_from_slurmd(int sock, char **argv, conf->log_opts.stderr_level = LOG_LEVEL_QUIET; if (conf->logfile) conf->log_opts.syslog_level = LOG_LEVEL_QUIET; - } else + } else conf->log_opts.syslog_level = LOG_LEVEL_QUIET; log_init(argv[0],conf->log_opts, LOG_DAEMON, conf->logfile); diff --git a/src/srun/msg.c b/src/srun/msg.c index 3d4b289904e56cde0a8f8f20c97bf0e158ef32b6..fff81bf4443aa2d15db37ab7af7430ee7bc5ab3f 100644 --- a/src/srun/msg.c +++ b/src/srun/msg.c @@ -108,27 +108,6 @@ static void _node_fail_handler(char *nodelist, srun_job_t *job); #define _poll_wr_isset(pfd) ((pfd).revents & POLLOUT) #define _poll_err(pfd) ((pfd).revents & POLLERR) -#undef safe_read -#define safe_read(fd, ptr, size) do { \ - if (read(fd, ptr, size) != size) { \ - debug("%s:%d: %s: read (%d bytes) failed: %m", \ - __FILE__, __LINE__, __CURRENT_FUNC__, \ - (int)size); \ - goto rwfail; \ - } \ - } while (0) - -#undef safe_write -#define safe_write(fd, ptr, size) do { \ - if (write(fd, ptr, size) != size) { \ - debug("%s:%d: %s: write (%d bytes) failed: %m", \ - __FILE__, __LINE__, __CURRENT_FUNC__, \ - (int)size); \ - goto rwfail; \ - } \ - } while (0) - - /* fd is job->forked_msg->par_msg->msg_pipe[1] */ static void _update_mpir_proctable(int fd, srun_job_t *job, int nodeid, int ntasks, uint32_t *pid,