diff --git a/src/common/qsw.c b/src/common/qsw.c index 76757b012dd2b0d8aa493ad38f98ceeefbea0484..787d16e77cbd980a2813f9df83fba213bc04b56d 100644 --- a/src/common/qsw.c +++ b/src/common/qsw.c @@ -48,6 +48,7 @@ #include <rms/rmscall.h> #include <src/common/bitstring.h> +#include <src/common/log.h> #include <src/common/pack.h> #include <src/common/qsw.h> #include <src/common/slurm_errno.h> @@ -283,6 +284,7 @@ qsw_free_jobinfo(qsw_jobinfo_t j) * data (OUT) where to store packed data * len (IN) max size of data * RETURN #bytes unused in 'data' or -1 on error (sets errno) + * NOTE: Keep in sync with QSW_PACK_SIZE above */ int qsw_pack_jobinfo(qsw_jobinfo_t j, void **data, int *len) diff --git a/src/common/qsw.h b/src/common/qsw.h index 2fefc9bbe1fb76709e749c2ff37f65ca1f6111e7..daf9f33b960ca3fb5daa5237277b7dae247f7e71 100644 --- a/src/common/qsw.h +++ b/src/common/qsw.h @@ -37,6 +37,7 @@ typedef struct qsw_jobinfo *qsw_jobinfo_t; #define QSW_LIBSTATE_PACK_MAX 12 #define QSW_JOBINFO_PACK_MAX 120 #define QSW_MAX_TASKS 1024 +#define QSW_PACK_SIZE (4 * (2+4+1+8+ELAN_BITMAPSIZE)) int qsw_alloc_libstate(qsw_libstate_t *lsp); void qsw_free_libstate(qsw_libstate_t ls); @@ -69,5 +70,6 @@ int qsw_getnodeid(void); int qsw_getnodeid_byhost(char *host); int qsw_gethost_bynodeid(char *host, int len, int elanid); +void qsw_print_jobinfo(FILE *fp, struct qsw_jobinfo *jobinfo); #endif /* _QSW_INCLUDED */ diff --git a/src/slurmctld/controller.c b/src/slurmctld/controller.c index eb153193ef4d0e43f6e0630897c9d88db8fdeac9..27b8d49d91eaa3c35947bad7e16f3760662cf192 100644 --- a/src/slurmctld/controller.c +++ b/src/slurmctld/controller.c @@ -236,7 +236,7 @@ slurmctld_signal_hand ( void * no_data ) if (error_code) error ("read_slurm_conf error %d", error_code); break; - case SIBABRT: /* abort */ + case SIGABRT: /* abort */ fatal ("SIGABRT received"); break; default: diff --git a/src/slurmctld/job_mgr.c b/src/slurmctld/job_mgr.c index 
e5ecee97b87c661c43371b83a7a4553482cc5f7c..3e8f716a4e1c4c903731139c46168a00eb85e979 100644 --- a/src/slurmctld/job_mgr.c +++ b/src/slurmctld/job_mgr.c @@ -42,7 +42,8 @@ #ifdef HAVE_LIBELAN3 #include <elan3/elan3.h> #include <elan3/elanvp.h> -#define BUF_SIZE (1024 + ELAN_MAX_VPS) +#include <src/common/qsw.h> +#define BUF_SIZE (1024 + QSW_PACK_SIZE) #else #define BUF_SIZE 1024 #endif @@ -310,7 +311,7 @@ dump_all_job_state ( void ) fatal ("dump_all_job: job integrity is bad"); buffer_needed = BUF_SIZE; #ifdef HAVE_LIBELAN3 - buffer_needed += (step_count (job_record_point) * ELAN_MAX_VPS / 8); + buffer_needed += (step_count (job_record_point) * QSW_PACK_SIZE); #endif if (buf_len < buffer_needed) { buffer_allocated += buffer_needed; @@ -407,13 +408,6 @@ dump_job_state (struct job_record *dump_job_ptr, void **buf_ptr, int *buf_len) /* Dump job steps */ step_record_iterator = list_iterator_create (dump_job_ptr->step_list); while ((step_record_ptr = (struct step_record *) list_next (step_record_iterator))) { -#ifdef HAVE_LIBELAN3 - if (*buf_len < ((ELAN_MAX_VPS / 8) + 60)) { - fatal ("dump_job_state, buffer space too small for %u.%u", - dump_job_ptr->job_id, step_record_ptr->step_id); - break; - } -#endif pack16 ((uint16_t) STEP_FLAG, buf_ptr, buf_len); dump_job_step_state (step_record_ptr, buf_ptr, buf_len); }; @@ -522,7 +516,7 @@ dump_job_step_state (struct step_record *step_ptr, void **buf_ptr, int *buf_len) packstr (node_list, buf_ptr, buf_len); xfree (node_list); #ifdef HAVE_LIBELAN3 - qsw_pack_jobinfo(step_ptr->qsw_job, (void **)buf_ptr, buf_len); + qsw_pack_jobinfo (step_ptr->qsw_job, (void **)buf_ptr, buf_len); #endif } @@ -731,7 +725,7 @@ load_job_state ( void ) xfree (node_list); } #ifdef HAVE_LIBELAN3 - if (buffer_size < (2 * sizeof (uint16_t))) + if (buffer_size < QSW_PACK_SIZE) break; qsw_alloc_jobinfo(&step_ptr->qsw_job); qsw_unpack_jobinfo(step_ptr->qsw_job, buf_ptr, &buffer_size); diff --git a/src/slurmd/elan_interconnect.c b/src/slurmd/elan_interconnect.c 
index 6c909c8e6658bb850726a465321a700886210c8a..869469a6084cae1af6f586e33449b45a84ed3fa3 100644 --- a/src/slurmd/elan_interconnect.c +++ b/src/slurmd/elan_interconnect.c @@ -143,18 +143,23 @@ int interconnect_set_capabilities(task_start_t * task_start) /* * Set environment variables needed by QSW MPICH / libelan. + * *envc is read as the starting count and rewritten with the count left by setenvpf(). + * RETURN 0 on success, -1 as soon as one setenvpf() call fails. */ -int interconnect_env(char ***env, int *envc, int nodeid, int nnodes, +int interconnect_env(char ***env, uint16_t *envc, int nodeid, int nnodes, int procid, int nprocs) { - if (setenvpf(env, envc, "RMS_RANK=%d", procid) < 0) - return -1; - if (setenvpf(env, envc, "RMS_NODEID=%d", nodeid) < 0) - return -1; - if (setenvpf(env, envc, "RMS_PROCID=%d", procid) < 0) - return -1; - if (setenvpf(env, envc, "RMS_NNODES=%d", nnodes) < 0) - return -1; - if (setenvpf(env, envc, "RMS_NPROCS=%d", nprocs) < 0) - return -1; - return 0; + int cnt = *envc; + int rc = 0; + + /* || short-circuits, so the calls run in order and stop at the first failure */ + if (setenvpf(env, &cnt, "RMS_RANK=%d", procid) < 0 + || setenvpf(env, &cnt, "RMS_NODEID=%d", nodeid) < 0 + || setenvpf(env, &cnt, "RMS_PROCID=%d", procid) < 0 + || setenvpf(env, &cnt, "RMS_NNODES=%d", nnodes) < 0 + || setenvpf(env, &cnt, "RMS_NPROCS=%d", nprocs) < 0) + rc = -1; + + /* setenvpf() only saw the local copy: hand its count back to the caller */ + *envc = (uint16_t) cnt; + return rc; } diff --git a/src/slurmd/interconnect.h b/src/slurmd/interconnect.h index e5fa4ec387f082af12b8ddde8b55f251399f7088..c1eae5d92011fc929db9a64df738859077e90827 100644 --- a/src/slurmd/interconnect.h +++ b/src/slurmd/interconnect.h @@ -24,4 +24,10 @@ int fan_out_task_launch ( launch_tasks_request_msg_t * launch_msg ); */ int interconnect_set_capabilities ( task_start_t * task_start ) ; +/* + * Set environment variables needed. 
+ */ +int interconnect_env(char ***env, uint16_t *envc, int nodeid, int nnodes, + int procid, int nprocs) ; + #endif diff --git a/src/slurmd/nbio.c b/src/slurmd/nbio.c index 886fb3e39e728a2bad5ec295f9d1fccfec96f9e8..3ea17377e96203b31d7c102d24129358b7fac93b 100644 --- a/src/slurmd/nbio.c +++ b/src/slurmd/nbio.c @@ -129,7 +129,7 @@ int init_nbio_attr(nbio_attr_t * nbio_attr, task_start_t * task_start) return SLURM_SUCCESS; } -void *do_nbio(void *arg) +int do_nbio(void *arg) { nbio_attr_t nbio_attr; task_start_t *task_start = (task_start_t *) arg; diff --git a/src/slurmd/nbio.h b/src/slurmd/nbio.h index 03de1dd0026c2e00150a1279ab2f300a53b3b7bc..8fbaba2c95ccff07745ae1955323a84c93d5fdb2 100644 --- a/src/slurmd/nbio.h +++ b/src/slurmd/nbio.h @@ -1,6 +1,6 @@ #ifndef _SLURMD_NBIO_H #define _SLURMD_NBIO_H -void *do_nbio(void *arg); +int do_nbio(void *arg); #endif diff --git a/src/slurmd/no_interconnect.c b/src/slurmd/no_interconnect.c index 860daba053ef52cef12e99de7a2fd754ed834b5a..cf7add7c9e3bac546db9494c39ca9c8adff3e432 100644 --- a/src/slurmd/no_interconnect.c +++ b/src/slurmd/no_interconnect.c @@ -25,7 +25,7 @@ int interconnect_set_capabilities ( task_start_t * task_start ) /* * Set environment variables needed by QSW MPICH / libelan. 
*/ -int interconnect_env(char ***env, int *envc, int nodeid, int nnodes, +int interconnect_env(char ***env, uint16_t *envc, int nodeid, int nnodes, int procid, int nprocs) { return SLURM_SUCCESS ; diff --git a/src/slurmd/pipes.c b/src/slurmd/pipes.c index e8db564cba8ab0608c5349380442765943eca290..b5b7bfddfc59ec03ff51fe2199704e2d5e88e3b7 100644 --- a/src/slurmd/pipes.c +++ b/src/slurmd/pipes.c @@ -38,7 +38,6 @@ int init_parent_pipes(int *pipes) int setup_child_pipes(int *pipes) { int error_code = SLURM_SUCCESS; - int local_errno; /* dup stdin */ /* close ( STDIN_FILENO ); */ diff --git a/src/slurmd/task_mgr.c b/src/slurmd/task_mgr.c index 96253b32fd1b323245a76de397381b3257339c35..f7dc4f9ecba3e1fde90044edc59433c90fa0deac 100644 --- a/src/slurmd/task_mgr.c +++ b/src/slurmd/task_mgr.c @@ -7,6 +7,7 @@ #include <unistd.h> #include <string.h> #include <pthread.h> +#include <unistd.h> #include <src/common/log.h> #include <src/common/list.h> @@ -20,16 +21,15 @@ #include <src/slurmd/circular_buffer.h> #include <src/slurmd/pipes.h> #include <src/slurmd/io.h> +#include <src/slurmd/interconnect.h> /* global variables */ /* prototypes */ int kill_task(task_t * task, int signal); +extern pid_t getsid(pid_t pid); -int interconnect_init(launch_tasks_request_msg_t * launch_msg); -int fan_out_task_launch(launch_tasks_request_msg_t * launch_msg); int send_task_exit_msg(int task_return_code, task_start_t * task_start); -int interconnect_set_capabilities(task_start_t * task_start); /****************************************************************** *task launch method call hierarchy