diff --git a/src/api/allocate.c b/src/api/allocate.c index bc2395441575a559b0b9c9174aac3ecfe2c76013..89687a97a93402e544dda86925d45c9e4170222c 100644 --- a/src/api/allocate.c +++ b/src/api/allocate.c @@ -298,3 +298,69 @@ slurm_job_step_create (job_step_create_request_msg_t * slurm_step_alloc_req_msg, return SLURM_PROTOCOL_SUCCESS ; } + +/* slurm_confirm_allocation - confirm an existing resource allocation */ +int +slurm_confirm_allocation (old_job_alloc_msg_t * job_desc_msg , resource_allocation_response_msg_t ** slurm_alloc_msg ) +{ + int msg_size ; + int rc ; + slurm_fd sockfd ; + slurm_msg_t request_msg ; + slurm_msg_t response_msg ; + return_code_msg_t * slurm_rc_msg ; + + /* init message connection for message communication with controller */ + if ( ( sockfd = slurm_open_controller_conn ( ) ) == SLURM_SOCKET_ERROR ) { + slurm_seterrno ( SLURM_COMMUNICATIONS_CONNECTION_ERROR ); + return SLURM_SOCKET_ERROR ; + } + + /* send request message */ + request_msg . msg_type = REQUEST_OLD_JOB_RESOURCE_ALLOCATION ; + request_msg . data = job_desc_msg ; + if ( ( rc = slurm_send_controller_msg ( sockfd , & request_msg ) ) == SLURM_SOCKET_ERROR ) { + slurm_seterrno ( SLURM_COMMUNICATIONS_SEND_ERROR ); + return SLURM_SOCKET_ERROR ; + } + + /* receive message */ + if ( ( msg_size = slurm_receive_msg ( sockfd , & response_msg ) ) == SLURM_SOCKET_ERROR ) { + slurm_seterrno ( SLURM_COMMUNICATIONS_RECEIVE_ERROR ); + return SLURM_SOCKET_ERROR ; + } + + /* shutdown message connection */ + if ( ( rc = slurm_shutdown_msg_conn ( sockfd ) ) == SLURM_SOCKET_ERROR ) { + slurm_seterrno ( SLURM_COMMUNICATIONS_SHUTDOWN_ERROR ); + return SLURM_SOCKET_ERROR ; + } + if ( msg_size ) + return msg_size; + + switch ( response_msg . msg_type ) + { + case RESPONSE_SLURM_RC: + slurm_rc_msg = ( return_code_msg_t * ) response_msg . 
data ; + rc = slurm_rc_msg->return_code; + slurm_free_return_code_msg ( slurm_rc_msg ); + if (rc) { + slurm_seterrno ( rc ); + return SLURM_PROTOCOL_ERROR; + } + *slurm_alloc_msg = NULL; + break ; + case RESPONSE_RESOURCE_ALLOCATION: + /* Calling method is responsible for freeing this memory */ + *slurm_alloc_msg = ( resource_allocation_response_msg_t * ) response_msg . data ; + return SLURM_PROTOCOL_SUCCESS; + break ; + default: + slurm_seterrno ( SLURM_UNEXPECTED_MSG_ERROR ); + return SLURM_PROTOCOL_ERROR; + break ; + } + + return SLURM_PROTOCOL_SUCCESS ; +} + diff --git a/src/api/slurm.h b/src/api/slurm.h index 98edf070d73f7ced1e8c0d38bcbe34ddb874844d..d8fbd6c5f64f5fc19d258fddffffa79dd0ac223b 100644 --- a/src/api/slurm.h +++ b/src/api/slurm.h @@ -39,6 +39,7 @@ extern void make_time_str (time_t *time, char *string); */ extern int slurm_allocate_resources (job_desc_msg_t * job_desc_msg , resource_allocation_response_msg_t ** job_alloc_resp_msg, int immediate ) ; extern int slurm_allocate_resources_and_run (job_desc_msg_t * job_desc_msg , resource_allocation_and_run_response_msg_t ** slurm_alloc_msg ); +extern int slurm_confirm_allocation (old_job_alloc_msg_t * job_desc_msg , resource_allocation_response_msg_t ** slurm_alloc_msg ) ; extern int slurm_cancel_job (uint32_t job_id); extern int slurm_cancel_job_step (uint32_t job_id, uint32_t step_id); diff --git a/src/common/slurm_protocol_defs.h b/src/common/slurm_protocol_defs.h index 2b7cad9a9966f5639fd61b876b46a5e0c37023d5..0cc6133c27c358be5ed75963fbd9dc98b0f6e27d 100644 --- a/src/common/slurm_protocol_defs.h +++ b/src/common/slurm_protocol_defs.h @@ -149,6 +149,7 @@ typedef enum { REQUEST_REVOKE_JOB_CREDENTIAL, REQUEST_ALLOCATION_AND_RUN_JOB_STEP, RESPONSE_ALLOCATION_AND_RUN_JOB_STEP, + REQUEST_OLD_JOB_RESOURCE_ALLOCATION, REQUEST_JOB_STEP_CREATE = 5001, RESPONSE_JOB_STEP_CREATE, @@ -495,6 +496,10 @@ typedef struct reattach_tasks_streams_msg { uint32_t *global_task_ids; } reattach_tasks_streams_msg_t; +typedef struct 
old_job_alloc_msg { + uint32_t job_id; + uint32_t uid; +} old_job_alloc_msg_t; typedef struct resource_allocation_response_msg { uint32_t job_id; diff --git a/src/common/slurm_protocol_pack.c b/src/common/slurm_protocol_pack.c index 906cc4c2b95b980c3632e1d4784c559bdc931ec0..fbd82f388c01cafde8d457d953d9a7cd5d54656d 100644 --- a/src/common/slurm_protocol_pack.c +++ b/src/common/slurm_protocol_pack.c @@ -141,6 +141,9 @@ int pack_msg ( slurm_msg_t const * msg , Buf buffer ) case REQUEST_ALLOCATION_AND_RUN_JOB_STEP : pack_job_desc ( (job_desc_msg_t * ) msg -> data , buffer ) ; break ; + case REQUEST_OLD_JOB_RESOURCE_ALLOCATION : + pack_old_job_desc ( (old_job_alloc_msg_t * ) msg -> data , buffer ) ; + break ; case REQUEST_NODE_REGISTRATION_STATUS : case REQUEST_RECONFIGURE : case REQUEST_SHUTDOWN_IMMEDIATE : @@ -287,6 +290,9 @@ int unpack_msg ( slurm_msg_t * msg , Buf buffer ) case REQUEST_ALLOCATION_AND_RUN_JOB_STEP : unpack_job_desc ( ( job_desc_msg_t **) & ( msg-> data ), buffer ) ; break ; + case REQUEST_OLD_JOB_RESOURCE_ALLOCATION : + unpack_old_job_desc ( (old_job_alloc_msg_t **) & ( msg -> data ), buffer ) ; + break ; case REQUEST_NODE_REGISTRATION_STATUS : case REQUEST_RECONFIGURE : case REQUEST_SHUTDOWN_IMMEDIATE : @@ -1110,8 +1116,6 @@ int unpack_slurm_ctl_conf ( slurm_ctl_conf_info_msg_t **build_buffer_ptr, Buf bu void pack_job_desc ( job_desc_msg_t * job_desc_ptr, Buf buffer ) { /* load the data values */ - /* unpack timestamp of snapshot */ - pack16 (job_desc_ptr->contiguous, buffer); pack16 (job_desc_ptr->kill_on_node_fail, buffer); packstr (job_desc_ptr->features, buffer); @@ -1162,7 +1166,6 @@ int unpack_job_desc ( job_desc_msg_t **job_desc_buffer_ptr, Buf buffer ) } /* load the data values */ - /* unpack timestamp of snapshot */ unpack16 (&job_desc_ptr->contiguous, buffer); unpack16 (&job_desc_ptr->kill_on_node_fail, buffer); @@ -1197,6 +1200,32 @@ int unpack_job_desc ( job_desc_msg_t **job_desc_buffer_ptr, Buf buffer ) return 0 ; } +void 
pack_old_job_desc ( old_job_alloc_msg_t * job_desc_ptr, Buf buffer ) +{ + /* load the data values */ + pack32 (job_desc_ptr->job_id, buffer); + pack32 (job_desc_ptr->uid, buffer); +} + +int unpack_old_job_desc ( old_job_alloc_msg_t **job_desc_buffer_ptr, Buf buffer ) +{ + old_job_alloc_msg_t * job_desc_ptr ; + + /* alloc memory for structure */ + job_desc_ptr = xmalloc ( sizeof ( old_job_alloc_msg_t ) ) ; + if (job_desc_ptr== NULL) + { + *job_desc_buffer_ptr = NULL ; + return ENOMEM ; + } + + /* load the data values */ + unpack32 (&job_desc_ptr->job_id, buffer); + unpack32 (&job_desc_ptr->uid, buffer); + *job_desc_buffer_ptr = job_desc_ptr ; + return 0 ; +} + void pack_last_update ( last_update_msg_t * msg , Buf buffer ) { pack_time ( msg -> last_update , buffer ) ; diff --git a/src/common/slurm_protocol_pack.h b/src/common/slurm_protocol_pack.h index 8c6588d01d49ec52d7310e446867a69a694b52fe..e20cff85402e64bd4ec048b69d98ede054f9cef9 100644 --- a/src/common/slurm_protocol_pack.h +++ b/src/common/slurm_protocol_pack.h @@ -64,6 +64,9 @@ int unpack_node_registration_status_msg ( slurm_node_registration_status_msg_t * void pack_job_desc ( job_desc_msg_t *job_desc_msg_ptr, Buf buffer ); int unpack_job_desc ( job_desc_msg_t **job_desc_msg_ptr, Buf buffer ); +void pack_old_job_desc ( old_job_alloc_msg_t * job_desc_ptr, Buf buffer ); +int unpack_old_job_desc ( old_job_alloc_msg_t **job_desc_buffer_ptr, Buf buffer ); + void pack_last_update ( last_update_msg_t * msg , Buf buffer ); int unpack_last_update ( last_update_msg_t ** msg , Buf buffer ); diff --git a/src/common/slurm_protocol_socket_implementation.c b/src/common/slurm_protocol_socket_implementation.c index 4b13d308d8e11d19a1e24c17b79b8be3b6f3c12e..2adfee9384298be2fab79afed0d8d4dd23b327db 100644 --- a/src/common/slurm_protocol_socket_implementation.c +++ b/src/common/slurm_protocol_socket_implementation.c @@ -240,7 +240,10 @@ ssize_t _slurm_msg_sendto_timeout ( slurm_fd open_fd, char *buffer , size_t size while ( 
true ) { - if ( ( send_len = _slurm_send_timeout ( open_fd , &usize , sizeof ( uint32_t ) , SLURM_PROTOCOL_NO_SEND_RECV_FLAGS , timeout ) ) == SLURM_PROTOCOL_ERROR ) + if ( ( send_len = _slurm_send_timeout ( open_fd , + (char *) &usize , sizeof ( uint32_t ) , + SLURM_PROTOCOL_NO_SEND_RECV_FLAGS , timeout ) ) == + SLURM_PROTOCOL_ERROR ) { if ( errno == EINTR ) continue ; diff --git a/src/slurmctld/controller.c b/src/slurmctld/controller.c index a1ee65ad78992f29751c0a87b039b202442278d0..8f95a112119049b886bf217f74eba8d3c2848eed 100644 --- a/src/slurmctld/controller.c +++ b/src/slurmctld/controller.c @@ -100,6 +100,7 @@ inline static void slurm_rpc_job_step_create( slurm_msg_t* msg ) ; inline static void slurm_rpc_job_step_get_info ( slurm_msg_t * msg ) ; inline static void slurm_rpc_job_will_run ( slurm_msg_t * msg ) ; inline static void slurm_rpc_node_registration ( slurm_msg_t * msg ) ; +inline static void slurm_rpc_old_job_alloc ( slurm_msg_t * msg ) ; inline static void slurm_rpc_ping ( slurm_msg_t * msg ) ; inline static void slurm_rpc_reconfigure_controller ( slurm_msg_t * msg ) ; inline static void slurm_rpc_shutdown_controller ( slurm_msg_t * msg ); @@ -593,6 +594,9 @@ slurmctld_req ( slurm_msg_t * msg ) slurm_rpc_allocate_and_run ( msg ); slurm_free_job_desc_msg ( msg -> data ) ; break; + case REQUEST_OLD_JOB_RESOURCE_ALLOCATION : + slurm_rpc_old_job_alloc ( msg ); + break; case REQUEST_JOB_WILL_RUN : slurm_rpc_job_will_run ( msg -> data ) ; slurm_free_job_desc_msg ( msg -> data ) ; @@ -1374,6 +1378,68 @@ slurm_rpc_allocate_and_run ( slurm_msg_t * msg ) } } +/* slurm_rpc_old_job_alloc - process RPC to get details on existing job */ +void slurm_rpc_old_job_alloc ( slurm_msg_t * msg ) +{ + int error_code = 0; + slurm_msg_t response_msg ; + clock_t start_time; + old_job_alloc_msg_t * job_desc_msg = ( old_job_alloc_msg_t * ) msg-> data ; + char * node_list_ptr = NULL; + uint16_t num_cpu_groups = 0; + uint32_t * cpus_per_node = NULL, * cpu_count_reps = NULL; + 
resource_allocation_response_msg_t alloc_msg ; + /* Locks: Read job, read node */ + slurmctld_lock_t job_read_lock = { NO_LOCK, READ_LOCK, READ_LOCK, NO_LOCK }; + uid_t uid = 0; + + start_time = clock (); + debug ("Processing RPC: REQUEST_OLD_JOB_RESOURCE_ALLOCATION"); + + /* do RPC call */ +#ifdef HAVE_AUTHD + uid = slurm_auth_uid (msg->cred); + if ( (uid != job_desc_msg->uid) && + (uid != 0) && (uid != getuid ()) ) { + error_code = ESLURM_USER_ID_MISSING; + error ("Security violation, RESOURCE_ALLOCATE from uid %u", (unsigned int) uid); + } +#endif + if (error_code == 0) { + lock_slurmctld (job_read_lock); + error_code = old_job_info (job_desc_msg->uid, job_desc_msg->job_id, + &node_list_ptr, &num_cpu_groups, &cpus_per_node, &cpu_count_reps ); + unlock_slurmctld (job_read_lock); + } + + /* return result */ + if (error_code) + { + info ("slurm_rpc_old_job_alloc error %d getting info, job=%u, uid=%u, time=%ld", + error_code, job_desc_msg->job_id, job_desc_msg->uid, + (long) (clock () - start_time)); + slurm_send_rc_msg ( msg , error_code ); + } + else + { + info ("slurm_rpc_old_job_alloc job=%u has nodes %s, time=%ld", + job_desc_msg->job_id, node_list_ptr, + (long) (clock () - start_time)); + + /* send job_ID and node_name_ptr */ + + alloc_msg . job_id = job_desc_msg->job_id ; + alloc_msg . node_list = node_list_ptr ; + alloc_msg . num_cpu_groups = num_cpu_groups; + alloc_msg . cpus_per_node = cpus_per_node; + alloc_msg . cpu_count_reps = cpu_count_reps; + response_msg . msg_type = RESPONSE_RESOURCE_ALLOCATION ; + response_msg . 
data = & alloc_msg ; + + slurm_send_node_msg ( msg->conn_fd , & response_msg ) ; + (void) dump_all_job_state ( ); + } +} /* slurm_rpc_job_will_run - process RPC to determine if job with given configuration can be initiated */ void slurm_rpc_job_will_run ( slurm_msg_t * msg ) diff --git a/src/slurmctld/job_mgr.c b/src/slurmctld/job_mgr.c index 8463fe7e0e3063ad9e3ba438cced0d65cd685350..6cfd89c70d08023c91923208460e193f15318450 100644 --- a/src/slurmctld/job_mgr.c +++ b/src/slurmctld/job_mgr.c @@ -2519,3 +2519,29 @@ signal_job_on_node (uint32_t job_id, uint16_t step_id, int signum, char *node_na signum, job_id, step_id, node_name); error ("CODE DEVELOPMENT NEEDED HERE"); } + + +/* old_job_info - get details about an existing job allocation */ +int +old_job_info (uint32_t uid, uint32_t job_id, char **node_list, + uint16_t * num_cpu_groups, uint32_t ** cpus_per_node, uint32_t ** cpu_count_reps) +{ + struct job_record *job_ptr; + + job_ptr = find_job_record (job_id); + if (job_ptr == NULL) + return ESLURM_INVALID_JOB_ID; + if ((uid != 0) && (job_ptr->user_id != uid)) + return ESLURM_ACCESS_DENIED; + if ((job_ptr->job_state != JOB_STAGE_IN) && + (job_ptr->job_state != JOB_RUNNING)) + return ESLURM_ALREADY_DONE; + + node_list[0] = job_ptr->nodes; + *num_cpu_groups = job_ptr->num_cpu_groups; + cpus_per_node[0] = job_ptr->cpus_per_node; + cpu_count_reps[0] = job_ptr->cpu_count_reps; + return SLURM_SUCCESS; +} + + diff --git a/src/slurmctld/slurmctld.h b/src/slurmctld/slurmctld.h index ee24914555ba05631009ff12079a5e264426c8a0..4a12cab3c3eb88599633f94703ea3e23950c5170 100644 --- a/src/slurmctld/slurmctld.h +++ b/src/slurmctld/slurmctld.h @@ -434,6 +434,10 @@ extern void node_did_resp (char *name); /* node_not_resp - record that the specified node is not responding */ extern void node_not_resp (char *name); +/* old_job_info - get details about an existing job allocation */ +extern int old_job_info (uint32_t uid, uint32_t job_id, char **node_list, + uint16_t * num_cpu_groups, 
uint32_t ** cpus_per_node, uint32_t ** cpu_count_reps); + /* * pack_all_jobs - dump all job information for all jobs in * machine independent form (for network transmission)