diff --git a/src/common/qsw.c b/src/common/qsw.c index eec002e28829c3dac918654d54761c2307ef2989..76757b012dd2b0d8aa493ad38f98ceeefbea0484 100644 --- a/src/common/qsw.c +++ b/src/common/qsw.c @@ -296,7 +296,7 @@ qsw_pack_jobinfo(qsw_jobinfo_t j, void **data, int *len) for (i = 0; i < 4; i++) pack32(j->j_cap.UserKey.Values[i], data, len); pack16(j->j_cap.Type, data, len); - pack16(j->j_cap.Generation, data, len); + pack16(j->j_cap.padding, data, len); pack32(j->j_cap.Version, data, len); pack32(j->j_cap.LowContext, data, len); pack32(j->j_cap.HighContext, data, len); @@ -330,8 +330,8 @@ qsw_unpack_jobinfo(qsw_jobinfo_t j, void **data, int *len) for (i = 0; i < 4; i++) unpack32(&j->j_cap.UserKey.Values[i], data, len); unpack16(&j->j_cap.Type, data, len); - unpack16(&j->j_cap.Generation, data, len); - unpack32(&j->j_cap.Version, data, len); + unpack16(&j->j_cap.padding, data, len); + unpack32(&j->j_cap.Version, data, len); unpack32(&j->j_cap.LowContext, data, len); unpack32(&j->j_cap.HighContext, data, len); unpack32(&j->j_cap.MyContext, data, len); @@ -590,11 +590,29 @@ qsw_prog_init(qsw_jobinfo_t jobinfo, uid_t uid) goto fail; } #else - /* see qsw gnat sw-elan/4334: elan3_control_open can return -1 */ - if ((jobinfo->j_ctx = elan3_control_open(0)) == NULL - || jobinfo->j_ctx == (void *)-1) { - slurm_seterrno(EELAN3CONTROL); - goto fail; + int i, nrails; + nrails = elan3_nrails(&jobinfo->j_cap); + + for (i = 0; i < nrails; i++) { + + ELAN3_CTX *ctx; + + /* see qsw gnat sw-elan/4334: elan3_control_open can ret -1 */ + if ((ctx = elan3_control_open(i)) == NULL + || ctx == (void *)-1) { + slurm_seterrno(EELAN3CONTROL); + goto fail; + } + + /* make cap known via rms_getcap/rms_ncaps to members + * of this prgnum */ + if (elan3_create(ctx, &jobinfo->j_cap) < 0) { + /* XXX masking errno value better than not knowing + * which function failed? */ + error("elan3_create(%d): %m", i); + slurm_seterrno(EELAN3CREATE); + goto fail; + } } #endif /* associate this process and its children with prgnum */ @@ -610,12 +628,6 @@ qsw_prog_init(qsw_jobinfo_t jobinfo, uid_t uid) goto fail; } - /* make cap known via rms_getcap/rms_ncaps to members of this prgnum */ - if (elan3_create(jobinfo->j_ctx, &jobinfo->j_cap) < 0) { - /* XXX masking errno value better than not knowing which function failed? */ - slurm_seterrno(EELAN3CREATE); - goto fail; - } if (rms_prgaddcap(jobinfo->j_prognum, 0, &jobinfo->j_cap) < 0) { /* translate errno values to more descriptive ones */ switch (errno) { @@ -682,7 +694,7 @@ qsw_getnodeid(void) int nodeid = -1; if (ctx) { - nodeid = ctx->devinfo.NodeId; + nodeid = ctx->devinfo.Position.NodeId; #if USE_OLD_LIBELAN _elan3_fini(ctx); #else @@ -818,6 +830,7 @@ void qsw_print_jobinfo(FILE *fp, struct qsw_jobinfo *jobinfo) { ELAN_CAPABILITY *cap; + char str[8192]; assert(jobinfo->j_magic == QSW_JOBINFO_MAGIC); @@ -825,12 +838,15 @@ qsw_print_jobinfo(FILE *fp, struct qsw_jobinfo *jobinfo) fprintf(fp, "prognum=%d\n", jobinfo->j_prognum); cap = &jobinfo->j_cap; + /* use elan3_capability_string as a shorter alternative for now */ + fprintf(fp, "%s\n", elan3_capability_string(cap, str)); +#if 0 fprintf(fp, "cap.UserKey=%8.8x.%8.8x.%8.8x.%8.8x\n", cap->UserKey.Values[0], cap->UserKey.Values[1], cap->UserKey.Values[2], cap->UserKey.Values[3]); - fprintf(fp, "cap.Version=%d\n", cap->Version); + /*fprintf(fp, "cap.Version=%d\n", cap->Version);*/ fprintf(fp, "cap.Type=0x%hx\n", cap->Type); - fprintf(fp, "cap.Generation=%hd\n", cap->Generation); + fprintf(fp, "cap.padding=%hd\n", cap->padding); fprintf(fp, "cap.LowContext=%d\n", cap->LowContext); fprintf(fp, "cap.HighContext=%d\n", cap->HighContext); fprintf(fp, "cap.MyContext=%d\n", cap->MyContext); @@ -840,5 +856,6 @@ qsw_print_jobinfo(FILE *fp, struct qsw_jobinfo *jobinfo) fprintf(fp, "cap.Railmask=0x%x\n", cap->RailMask); fprintf(fp, "cap.Bitmap="); _print_capbitmap(fp, cap); +#endif fprintf(fp, "\n------------------\n"); } diff --git a/src/common/slurm_errno.c b/src/common/slurm_errno.c index 8e952bfb0dbb1d4243a6a237bdc7270d0247b862..09386572c0b9c6f817e2ed7f1173fb67396e5d8f 100644 --- a/src/common/slurm_errno.c +++ b/src/common/slurm_errno.c @@ -86,10 +86,8 @@ static slurm_errtab_t slurm_errtab[] = { { ENOSLURM, "Out of slurm" }, /* oh no! */ { EBADMAGIC_QSWLIBSTATE,"Bad magic in QSW libstate" }, { EBADMAGIC_QSWJOBINFO, "Bad magic in QSW jobinfo" }, - { EINVAL_PRGCREATE, "Program identifier in use or number of CPUs invalid" - }, - { ECHILD_PRGDESTROY, "Processes belonging to this program are still runnin -g" }, + { EINVAL_PRGCREATE, "Program identifier in use or number of CPUs invalid" }, + { ECHILD_PRGDESTROY, "Processes belonging to this program are still running" }, { EEXIST_PRGDESTROY, "Program identifier does not exist" }, { EELAN3INIT, "Too many processes using Elan or mapping failure" }, { EELAN3CONTROL, "Could not open elan3 control device" }, diff --git a/src/common/slurm_protocol_api.c b/src/common/slurm_protocol_api.c index 216e372f064ea2f77d452122c0434574e7312bfd..5ad4c1156d57058cf103c5b3c5b0f6d4df3a0eb9 100644 --- a/src/common/slurm_protocol_api.c +++ b/src/common/slurm_protocol_api.c @@ -92,7 +92,7 @@ int read_slurm_port_config ( ) slurm_spec_file = fopen (SLURM_CONFIG_FILE, "r"); if (slurm_spec_file == NULL) { - error ( "read_slurm_conf error %d opening file %s", + error ( "read_slurm_conf error %d opening file %s: %m", errno, SLURM_CONFIG_FILE); return SLURM_ERROR ; } @@ -225,14 +225,10 @@ slurm_fd slurm_open_controller_conn ( ) /* try to send to primary first then secondary */ if ( ( connection_fd = slurm_open_msg_conn ( & proto_conf -> primary_controller ) ) == SLURM_SOCKET_ERROR ) { - int local_errno = errno ; - debug ( "Open connection to primary controller failed errno: %i", local_errno ) ; + debug ( "Open connection to primary controller failed: %m" ) ; if ( ( connection_fd = slurm_open_msg_conn ( & proto_conf -> secondary_controller ) ) == SLURM_SOCKET_ERROR ) - { - int local_errno = errno ; - debug ( "Open connection to secondary controller failed errno: %i", local_errno ) ; - } + debug ( "Open connection to secondary controller failed: %m" ) ; } return connection_fd ; } @@ -277,8 +273,7 @@ int slurm_receive_msg ( slurm_fd open_fd , slurm_msg_t * msg ) if ( ( rc = _slurm_msg_recvfrom ( open_fd , buffer , receive_len, SLURM_PROTOCOL_NO_SEND_RECV_FLAGS , & (msg)->address ) ) == SLURM_SOCKET_ERROR ) { - int local_errno = errno ; - debug ( "slurm_receive_msg: Error receiving msg socket: %m errno %i", local_errno ) ; + debug ( "Error receiving msg socket: %m") ; return rc ; } @@ -337,14 +332,10 @@ int slurm_send_controller_msg ( slurm_fd open_fd , slurm_msg_t * msg ) msg -> address = proto_conf -> primary_controller ; if ( (rc = slurm_send_node_msg ( open_fd , msg ) ) == SLURM_SOCKET_ERROR ) { - int local_errno = errno ; - debug ( "Send message to primary controller failed errno: %i", local_errno ) ; + debug ( "Send message to primary controller failed: %m" ) ; msg -> address = proto_conf -> secondary_controller ; if ( (rc = slurm_send_node_msg ( open_fd , msg ) ) == SLURM_SOCKET_ERROR ) - { - int local_errno = errno ; - debug ( "Send messge to secondary controller failed errno: %i", local_errno ) ; - } + debug ( "Send messge to secondary controller failed: %m" ) ; } return rc ; } @@ -399,10 +390,7 @@ int slurm_send_node_msg ( slurm_fd open_fd , slurm_msg_t * msg ) /* send msg */ if ( ( rc = _slurm_msg_sendto ( open_fd , buf_temp , SLURM_PROTOCOL_MAX_MESSAGE_BUFFER_SIZE - pack_len , SLURM_PROTOCOL_NO_SEND_RECV_FLAGS , &msg->address ) ) == SLURM_SOCKET_ERROR ) - { - int local_errno = errno ; - debug ( "Error sending msg socket: errno %i", local_errno ) ; - } + debug ( "Error sending msg socket: %m" ) ; return rc ; } diff --git a/src/common/slurm_protocol_pack.c b/src/common/slurm_protocol_pack.c index 32ee9bbd30757f9aae2fc3c5aba4ba81510158f2..b9ed180d0e6f08a7fa0885e6c3d992c6bc3b5cdc 100644 --- a/src/common/slurm_protocol_pack.c +++ b/src/common/slurm_protocol_pack.c @@ -548,7 +548,10 @@ int unpack_resource_allocation_and_run_response_msg ( resource_allocation_and_ru unpack_job_credential( &tmp_ptr->credentials, ( void ** ) buffer , length ) ; #ifdef HAVE_LIBELAN3 qsw_alloc_jobinfo(&tmp_ptr->qsw_job); - qsw_unpack_jobinfo(tmp_ptr->qsw_job , (void **) buffer , length ) ; + if (qsw_unpack_jobinfo(tmp_ptr->qsw_job, (void **) buffer, length) < 0) { + error("qsw_unpack_jobinfo: %m"); + return -1; + } #endif *msg = tmp_ptr ; @@ -1378,10 +1381,15 @@ int unpack_launch_tasks_request_msg ( launch_tasks_request_msg_t ** msg_ptr , vo slurm_unpack_slurm_addr_no_alloc ( & msg -> response_addr , buffer , length ) ; slurm_unpack_slurm_addr_no_alloc ( & msg -> streams , buffer , length ) ; unpack32_array ( & msg -> global_task_ids , & uint16_tmp , buffer , length ) ; + #ifdef HAVE_LIBELAN3 qsw_alloc_jobinfo(&msg->qsw_job); - qsw_unpack_jobinfo(msg -> qsw_job , (void **) buffer , length ) ; + if (qsw_unpack_jobinfo(msg->qsw_job, (void **) buffer, length) < 0) { + error("qsw_unpack_jobinfo: %m"); + return -1; + } #endif + *msg_ptr = msg ; return 0 ; } diff --git a/src/common/slurm_protocol_socket_implementation.c b/src/common/slurm_protocol_socket_implementation.c index afef6fe8749bb8b0a724f646f54a25fa2397cfe3..7a08955ccab052416763da14989b9a9e97ca2a12 100644 --- a/src/common/slurm_protocol_socket_implementation.c +++ b/src/common/slurm_protocol_socket_implementation.c @@ -419,7 +419,7 @@ slurm_fd _slurm_listen_stream ( slurm_addr * slurm_address ) const int one = 1; if ( ( connection_fd =_slurm_create_socket ( SLURM_STREAM ) ) == SLURM_SOCKET_ERROR ) { - debug ( "Error creating slurm stream socket: errno %i", errno ) ; + debug ( "Error creating slurm stream socket: %m" ) ; return connection_fd ; } @@ -431,13 +431,13 @@ slurm_fd _slurm_listen_stream ( slurm_addr * slurm_address ) if ( ( rc = _slurm_bind ( connection_fd , ( struct sockaddr const * ) slurm_address , sizeof ( slurm_addr ) ) ) == SLURM_SOCKET_ERROR ) { - debug ( "Error binding slurm stream socket: errno %i" , errno ) ; + debug ( "Error binding slurm stream socket: %m" ) ; goto error_cleanup ; } if ( ( rc = _slurm_listen ( connection_fd , SLURM_PROTOCOL_DEFAULT_LISTEN_BACKLOG ) ) == SLURM_SOCKET_ERROR ) { - debug ( "Error listening on slurm stream socket: errno %i" , errno ) ; + debug ( "Error listening on slurm stream socket: %m" ) ; goto error_cleanup ; } @@ -456,7 +456,7 @@ slurm_fd _slurm_accept_stream ( slurm_fd open_fd , slurm_addr * slurm_address ) slurm_fd connection_fd ; if ( ( connection_fd = _slurm_accept ( open_fd , ( struct sockaddr * ) slurm_address , & addr_len ) ) == SLURM_SOCKET_ERROR ) { - debug ( "Error accepting slurm stream socket: errno %i", errno ) ; + debug ( "Error accepting slurm stream socket: %m" ) ; } return connection_fd ; @@ -468,13 +468,13 @@ slurm_fd _slurm_open_stream ( slurm_addr * slurm_address ) slurm_fd connection_fd ; if ( ( connection_fd =_slurm_create_socket ( SLURM_STREAM ) ) == SLURM_SOCKET_ERROR ) { - debug ( "Error creating slurm stream socket: errno %i", errno ) ; + debug ( "Error creating slurm stream socket: %m" ) ; return connection_fd ; } if ( ( rc = _slurm_connect ( connection_fd , ( struct sockaddr const * ) slurm_address , sizeof ( slurm_addr ) ) ) == SLURM_SOCKET_ERROR ) { - debug ( "Error connecting on slurm stream socket: errno %i" , errno ) ; + debug ( "Error connecting on slurm stream socket: %m" ) ; goto error_cleanup ; }