diff --git a/src/common/qsw.c b/src/common/qsw.c index e4d5a4909fe1857d298693c67b01647bccb69db5..eec002e28829c3dac918654d54761c2307ef2989 100644 --- a/src/common/qsw.c +++ b/src/common/qsw.c @@ -160,7 +160,7 @@ qsw_pack_libstate(qsw_libstate_t ls, void **data, int *len) pack32(ls->ls_prognum, data, len); pack32(ls->ls_hwcontext, data, len); - return len; + return *len; } /* @@ -182,7 +182,7 @@ qsw_unpack_libstate(qsw_libstate_t ls, void **data, int *len) if (ls->ls_magic != QSW_LIBSTATE_MAGIC) slurm_seterrno_ret(EBADMAGIC_QSWLIBSTATE); /* corrupted libstate */ - return len; + return *len; } /* @@ -285,30 +285,30 @@ qsw_free_jobinfo(qsw_jobinfo_t j) * RETURN #bytes unused in 'data' or -1 on error (sets errno) */ int -qsw_pack_jobinfo(qsw_jobinfo_t j, void *data, int len) +qsw_pack_jobinfo(qsw_jobinfo_t j, void **data, int *len) { int i; assert(j->j_magic == QSW_JOBINFO_MAGIC); - pack32(j->j_magic, &data, &len); - pack32(j->j_prognum, &data, &len); + pack32(j->j_magic, data, len); + pack32(j->j_prognum, data, len); for (i = 0; i < 4; i++) - pack32(j->j_cap.UserKey.Values[i], &data, &len); - pack16(j->j_cap.Type, &data, &len); - pack16(j->j_cap.Generation, &data, &len); - pack32(j->j_cap.Version, &data, &len); - pack32(j->j_cap.LowContext, &data, &len); - pack32(j->j_cap.HighContext, &data, &len); - pack32(j->j_cap.MyContext, &data, &len); - pack32(j->j_cap.LowNode, &data, &len); - pack32(j->j_cap.HighNode, &data, &len); - pack32(j->j_cap.Entries, &data, &len); - pack32(j->j_cap.RailMask, &data, &len); + pack32(j->j_cap.UserKey.Values[i], data, len); + pack16(j->j_cap.Type, data, len); + pack16(j->j_cap.Generation, data, len); + pack32(j->j_cap.Version, data, len); + pack32(j->j_cap.LowContext, data, len); + pack32(j->j_cap.HighContext, data, len); + pack32(j->j_cap.MyContext, data, len); + pack32(j->j_cap.LowNode, data, len); + pack32(j->j_cap.HighNode, data, len); + pack32(j->j_cap.Entries, data, len); + pack32(j->j_cap.RailMask, data, len); for (i = 0; i < 
ELAN_BITMAPSIZE; i++) - pack32(j->j_cap.Bitmap[i], &data, &len); - - return len; + pack32(j->j_cap.Bitmap[i], data, len); + + return *len; } /* @@ -319,33 +319,33 @@ qsw_pack_jobinfo(qsw_jobinfo_t j, void *data, int len) * RETURN #bytes unused in 'data' or -1 on error (sets errno) */ int -qsw_unpack_jobinfo(qsw_jobinfo_t j, void *data, int len) +qsw_unpack_jobinfo(qsw_jobinfo_t j, void **data, int *len) { int i; assert(j->j_magic == QSW_JOBINFO_MAGIC); - unpack32(&j->j_magic, &data, &len); - unpack32(&j->j_prognum, &data, &len); + unpack32(&j->j_magic, data, len); + unpack32(&j->j_prognum, data, len); for (i = 0; i < 4; i++) - unpack32(&j->j_cap.UserKey.Values[i], &data, &len); - unpack16(&j->j_cap.Type, &data, &len); - unpack16(&j->j_cap.Generation, &data, &len); - unpack32(&j->j_cap.Version, &data, &len); - unpack32(&j->j_cap.LowContext, &data, &len); - unpack32(&j->j_cap.HighContext, &data, &len); - unpack32(&j->j_cap.MyContext, &data, &len); - unpack32(&j->j_cap.LowNode, &data, &len); - unpack32(&j->j_cap.HighNode, &data, &len); - unpack32(&j->j_cap.Entries, &data, &len); - unpack32(&j->j_cap.RailMask, &data, &len); + unpack32(&j->j_cap.UserKey.Values[i], data, len); + unpack16(&j->j_cap.Type, data, len); + unpack16(&j->j_cap.Generation, data, len); + unpack32(&j->j_cap.Version, data, len); + unpack32(&j->j_cap.LowContext, data, len); + unpack32(&j->j_cap.HighContext, data, len); + unpack32(&j->j_cap.MyContext, data, len); + unpack32(&j->j_cap.LowNode, data, len); + unpack32(&j->j_cap.HighNode, data, len); + unpack32(&j->j_cap.Entries, data, len); + unpack32(&j->j_cap.RailMask, data, len); for (i = 0; i < ELAN_BITMAPSIZE; i++) - unpack32(&j->j_cap.Bitmap[i], &data, &len); + unpack32(&j->j_cap.Bitmap[i], data, len); if (j->j_magic != QSW_JOBINFO_MAGIC) slurm_seterrno_ret(EBADMAGIC_QSWJOBINFO); - return len; + return *len; } /* @@ -800,7 +800,6 @@ qsw_prgsignal(qsw_jobinfo_t jobinfo, int signum) } -#if 0 #define TRUNC_BITMAP 1 static void _print_capbitmap(FILE 
*fp, ELAN_CAPABILITY *cap) @@ -812,7 +811,7 @@ _print_capbitmap(FILE *fp, ELAN_CAPABILITY *cap) #endif for (bit = bit_max; bit >= 0; bit--) fprintf(fp, "%c", BT_TEST(cap->Bitmap, bit) ? '1' : '0'); - printf(fp, "\n"); + fprintf(fp, "\n"); } void @@ -843,4 +842,3 @@ qsw_print_jobinfo(FILE *fp, struct qsw_jobinfo *jobinfo) _print_capbitmap(fp, cap); fprintf(fp, "\n------------------\n"); } -#endif diff --git a/src/common/qsw.h b/src/common/qsw.h index 12b46ea20c2c254ab1c5a02a5538df6fcd5fff34..2fefc9bbe1fb76709e749c2ff37f65ca1f6111e7 100644 --- a/src/common/qsw.h +++ b/src/common/qsw.h @@ -50,8 +50,8 @@ void qsw_fini(qsw_libstate_t savestate); int qsw_alloc_jobinfo(qsw_jobinfo_t *jp); void qsw_free_jobinfo(qsw_jobinfo_t j); -int qsw_pack_libstate(qsw_libstate_t ls, void **data, int *len); -int qsw_unpack_jobinfo(qsw_libstate_t ls, void **data, int *len); +int qsw_pack_jobinfo(qsw_jobinfo_t j, void **data, int *len); +int qsw_unpack_jobinfo(qsw_jobinfo_t j, void **data, int *len); int qsw_setup_jobinfo(qsw_jobinfo_t j, int nprocs, bitstr_t *nodeset, int cyclic_alloc); diff --git a/src/common/slurm_protocol_defs.h b/src/common/slurm_protocol_defs.h index ee907b349547a847435734b49d3e039e3b401a21..1cc455e67e7ce3e142e6ba1627c58ea6a1250462 100644 --- a/src/common/slurm_protocol_defs.h +++ b/src/common/slurm_protocol_defs.h @@ -420,9 +420,12 @@ typedef struct launch_tasks_request_msg { uint32_t job_id ; uint32_t job_step_id ; + uint32_t nnodes ; /* number of nodes in this job step */ + uint32_t nprocs ; /* number of processes in this job step */ uint32_t uid ; - uint32_t srun_node_id ; - slurm_job_credential_t* credential; + uint32_t srun_node_id ; /* node id of this node (relative to job) */ + + slurm_job_credential_t * credential; /* job credential */ uint32_t tasks_to_launch ; uint16_t envc ; char ** env ; diff --git a/src/common/slurm_protocol_pack.c b/src/common/slurm_protocol_pack.c index 
483a89d7217553b53c9c98244fc691ececd7ce0e..4e80ae96dbca6c5a39ecf7b49bfee54a161907e9 100644 --- a/src/common/slurm_protocol_pack.c +++ b/src/common/slurm_protocol_pack.c @@ -506,7 +506,7 @@ void pack_resource_allocation_and_run_response_msg ( resource_allocation_and_run pack32 ( msg -> job_step_id , ( void ** ) buffer , length ) ; pack_job_credential( msg->credentials, ( void ** ) buffer , length ) ; #ifdef HAVE_LIBELAN3 - slurm_qsw_pack_jobinfo( msg -> qsw_job , (void ** ) buffer , length) ; + qsw_pack_jobinfo( msg -> qsw_job , (void ** ) buffer , length) ; #endif } @@ -541,7 +541,8 @@ int unpack_resource_allocation_and_run_response_msg ( resource_allocation_and_ru unpack32 ( &tmp_ptr -> job_step_id, ( void ** ) buffer , length ) ; unpack_job_credential( &tmp_ptr->credentials, ( void ** ) buffer , length ) ; #ifdef HAVE_LIBELAN3 - slurm_qsw_unpack_jobinfo(& tmp_ptr -> qsw_job , (void **) buffer , length ) ; + qsw_alloc_jobinfo(&tmp_ptr->qsw_job); + qsw_unpack_jobinfo(tmp_ptr->qsw_job , (void **) buffer , length ) ; #endif *msg = tmp_ptr ; @@ -778,33 +779,6 @@ int unpack_job_credential( slurm_job_credential_t** msg , void ** buffer , uint3 return 0; } -#ifdef HAVE_LIBELAN3 -void slurm_qsw_pack_jobinfo(qsw_jobinfo_t j, void **data, int * len) -{ - int packlen ; - packlen = qsw_pack_jobinfo(j, *data, *len) ; - if ( packlen > 0 ) - { - *len -= packlen ; - ((char *) *data) += packlen ; - } -} - -int slurm_qsw_unpack_jobinfo(qsw_jobinfo_t * j, void **data, int *len) -{ - int packlen ; - /* *j = xmalloc ( sizeof ( struct qsw_jobinfo ) ) ; */ - qsw_alloc_jobinfo( j ) ; - packlen = qsw_unpack_jobinfo ( *j , *data , * len ) ; - if ( packlen > 0 ) - { - *len -= packlen ; - ((char *) *data) += packlen ; - } - return 0 ; -} -#endif - void pack_job_step_create_response_msg ( job_step_create_response_msg_t* msg , void ** buffer , uint32_t * length ) { assert ( msg != NULL ); @@ -813,7 +787,7 @@ void pack_job_step_create_response_msg ( job_step_create_response_msg_t* msg , packstr 
( msg -> node_list, ( void ** ) buffer , length ) ; pack_job_credential( msg->credentials, ( void ** ) buffer , length ) ; #ifdef HAVE_LIBELAN3 - slurm_qsw_pack_jobinfo( msg -> qsw_job , (void ** ) buffer , length) ; + qsw_pack_jobinfo(msg->qsw_job , (void ** ) buffer , length) ; #endif } @@ -833,7 +807,8 @@ int unpack_job_step_create_response_msg (job_step_create_response_msg_t** msg , *msg = tmp_ptr; #ifdef HAVE_LIBELAN3 - slurm_qsw_unpack_jobinfo(& tmp_ptr -> qsw_job , (void **) buffer , length ) ; + qsw_alloc_jobinfo(&tmp_ptr->qsw_job); + qsw_unpack_jobinfo( tmp_ptr -> qsw_job , (void **) buffer , length ) ; #endif return 0; } @@ -1352,6 +1327,8 @@ void pack_launch_tasks_request_msg ( launch_tasks_request_msg_t * msg , void ** { pack32 ( msg -> job_id , buffer , length ) ; pack32 ( msg -> job_step_id , buffer , length ) ; + pack32 ( msg -> nnodes, buffer, length ) ; + pack32 ( msg -> nprocs, buffer, length ) ; pack32 ( msg -> uid , buffer , length ) ; pack32 ( msg -> srun_node_id , buffer , length ) ; pack_job_credential ( msg -> credential , buffer , length ) ; @@ -1363,7 +1340,7 @@ void pack_launch_tasks_request_msg ( launch_tasks_request_msg_t * msg , void ** slurm_pack_slurm_addr ( & msg -> streams , buffer , length ) ; pack32_array ( msg -> global_task_ids , ( uint16_t ) msg -> tasks_to_launch , buffer , length ) ; #ifdef HAVE_LIBELAN3 - slurm_qsw_pack_jobinfo( msg -> qsw_job , (void ** ) buffer , length) ; + qsw_pack_jobinfo( msg -> qsw_job , (void ** ) buffer , length) ; #endif } @@ -1381,6 +1358,8 @@ int unpack_launch_tasks_request_msg ( launch_tasks_request_msg_t ** msg_ptr , vo unpack32 ( & msg -> job_id , buffer , length ) ; unpack32 ( & msg -> job_step_id , buffer , length ) ; + unpack32 ( & msg -> nnodes, buffer, length ) ; + unpack32 ( & msg -> nprocs, buffer, length ) ; unpack32 ( & msg -> uid , buffer , length ) ; unpack32 ( & msg -> srun_node_id , buffer , length ) ; unpack_job_credential( & msg -> credential , buffer , length ) ; @@ -1392,7 
+1371,8 @@ int unpack_launch_tasks_request_msg ( launch_tasks_request_msg_t ** msg_ptr , vo slurm_unpack_slurm_addr_no_alloc ( & msg -> streams , buffer , length ) ; unpack32_array ( & msg -> global_task_ids , & uint16_tmp , buffer , length ) ; #ifdef HAVE_LIBELAN3 - slurm_qsw_unpack_jobinfo(& msg -> qsw_job , (void **) buffer , length ) ; + qsw_alloc_jobinfo(&msg->qsw_job); + qsw_unpack_jobinfo(msg -> qsw_job , (void **) buffer , length ) ; #endif *msg_ptr = msg ; return 0 ; diff --git a/src/common/slurm_protocol_pack.h b/src/common/slurm_protocol_pack.h index 3237ff7c504992e68212327938f556e490655763..17c2da249895a4e514937c55b41c0e66df2c2331 100644 --- a/src/common/slurm_protocol_pack.h +++ b/src/common/slurm_protocol_pack.h @@ -152,9 +152,4 @@ int unpack_task_exit_msg ( task_exit_msg_t ** msg_ptr , void ** buffer , uint32_ void pack_job_credential ( slurm_job_credential_t* cred , void ** buffer , uint32_t * length ) ; int unpack_job_credential( slurm_job_credential_t** msg , void ** buffer , uint32_t * length ) ; -#ifdef HAVE_LIBELAN3 -void slurm_qsw_pack_jobinfo(qsw_jobinfo_t j, void **data, int * len) ; -int slurm_qsw_unpack_jobinfo(qsw_jobinfo_t * j, void **data, int *len) ; -#endif - #endif diff --git a/src/slurmctld/job_mgr.c b/src/slurmctld/job_mgr.c index 6fae1673f1b41b1cc7bd922f567caf6ea08164f3..60bfc43ef8735236234efa30c7cd199f62d69c58 100644 --- a/src/slurmctld/job_mgr.c +++ b/src/slurmctld/job_mgr.c @@ -501,7 +501,7 @@ dump_job_step_state (struct step_record *step_ptr, void **buf_ptr, int *buf_len) packstr (node_list, buf_ptr, buf_len); xfree (node_list); #ifdef HAVE_LIBELAN3 - qsw_pack_libstate(step_ptr->qsw_job, buf_ptr, buf_len); + qsw_pack_jobinfo(step_ptr->qsw_job, (void **)buf_ptr, buf_len); #endif } @@ -704,7 +704,7 @@ load_job_state ( void ) xfree (node_list); } #ifdef HAVE_LIBELAN3 - qsw_unpack_libstate(step_ptr->qsw_job, buf_ptr, buf_len); + qsw_unpack_jobinfo(step_ptr->qsw_job, &buf_ptr, &buffer_size); #endif info ("recovering job step %u.%u", 
job_id, step_id); unpack16 (&step_flag, &buf_ptr, &buffer_size); diff --git a/src/slurmctld/step_mgr.c b/src/slurmctld/step_mgr.c index 0275a5c345d030f20f20c816c3a5859403e71db0..51d572651c564d45c81943299cb775e3a30581fd 100644 --- a/src/slurmctld/step_mgr.c +++ b/src/slurmctld/step_mgr.c @@ -346,7 +346,7 @@ step_create ( step_specs *step_specs, struct step_record** new_step_record ) } } if (qsw_setup_jobinfo (step_ptr->qsw_job, nprocs, nodeset, step_ptr->node_bitmap) < 0) - fatal ("step_create: qsw_setup_jobinfo error"); + fatal ("step_create: qsw_setup_jobinfo error %m"); bit_free (nodeset); #endif diff --git a/src/srun/launch.c b/src/srun/launch.c index 14ecede3abce5815a5efca7f2c6ecb5ce35ec21b..c5250b515241a50724dea54f88f22ed8c213e5f3 100644 --- a/src/srun/launch.c +++ b/src/srun/launch.c @@ -72,7 +72,7 @@ launch(void *arg) #if HAVE_LIBELAN3 msg.qsw_job = job->qsw_job; -#endif +#endif debug("setting iopart to %s:%d", hostname, ntohs(job->ioport)); slurm_set_addr_char(&msg.streams , ntohs(job->ioport), hostname); debug("sending to slurmd port %d", slurm_get_slurmd_port()); diff --git a/src/srun/srun.c b/src/srun/srun.c index 73d3b7e0aab9aa17cac11752dd54ed814a02a33b..8d6f48ae46d1896cca7a511d8b4aa7de6ac31028 100644 --- a/src/srun/srun.c +++ b/src/srun/srun.c @@ -175,7 +175,7 @@ main(int ac, char **av) } /* job is now overdone, blow this popsicle stand */ - + if (!opt.no_alloc) slurm_complete_job(job->jobid); @@ -284,8 +284,10 @@ create_job_step(job_t *job) req_msg.msg_type = REQUEST_JOB_STEP_CREATE; req_msg.data = &req; - if (slurm_send_recv_controller_msg(&req_msg, &resp_msg) < 0) + if (slurm_send_recv_controller_msg(&req_msg, &resp_msg) < 0) { error("unable to create job step: %s", slurm_strerror(errno)); + exit(1); + } if (resp_msg.msg_type == RESPONSE_SLURM_RC) { return_code_msg_t *rcmsg = (return_code_msg_t *) resp_msg.data; @@ -330,7 +332,8 @@ sig_thr(void *arg) { job_t *job = (job_t *)arg; sigset_t set; - static time_t last_intr = 0; + time_t last_intr = 0; + 
bool suddendeath = false; int signo; struct sigaction action; @@ -356,6 +359,7 @@ sig_thr(void *arg) job->state = SRUN_JOB_OVERDONE; pthread_cond_signal(&job->state_cond); pthread_mutex_unlock(&job->state_mutex); + suddendeath = true; } break; default: