diff --git a/src/api/slurm.h b/src/api/slurm.h index 589cf1fb233a6593221f9042a5d5313b6f7d28df..0c9c078c85d8da8694eba4e7570d483c5f24120d 100644 --- a/src/api/slurm.h +++ b/src/api/slurm.h @@ -129,23 +129,9 @@ extern int slurm_load_node (time_t update_time, node_info_msg_t **node_info_msg_ */ extern int slurm_load_partitions (time_t update_time, partition_info_msg_t **part_buffer_ptr); -/* - * slurm_submit - submit/queue a job with supplied contraints. - * input: spec - specification of the job's constraints - * job_id - place to store id of submitted job - * output: job_id - the job's id - * returns 0 if no error, EINVAL if the request is invalid - * NOTE: required specification include: Script=<script_path_name> - * User=<uid> - * NOTE: optional specifications include: Contiguous=<YES|NO> - * Distribution=<BLOCK|CYCLE> Features=<features> Groups=<groups> - * JobId=<id> JobName=<name> Key=<key> MinProcs=<count> - * MinRealMemory=<MB> MinTmpDisk=<MB> Partition=<part_name> - * Priority=<integer> ProcsPerTask=<count> ReqNodes=<node_list> - * Shared=<YES|NO> TimeLimit=<minutes> TotalNodes=<count> - * TotalProcs=<count> Immediate=<YES|NO> - */ -extern int slurm_submit_batch_job (job_desc_msg_t * job_desc_msg ); +/* slurm_submit_batch_job - submit a batch job to the controller; on SLURM_SUCCESS, *slurm_alloc_msg points to the response holding the new job's id */ +extern int slurm_submit_batch_job (job_desc_msg_t * job_desc_msg, + submit_response_msg_t ** slurm_alloc_msg ); /* * slurm_will_run - determine if a job would execute immediately diff --git a/src/api/submit.c b/src/api/submit.c index 484a39db2727302b2dd43674978a0b17d48a9336..d5d06977cfd5910cafdaf1751d6333e9f0bd924a 100644 --- a/src/api/submit.c +++ b/src/api/submit.c @@ -20,7 +20,7 @@ /* slurm_submit_job - load the supplied node information buffer if changed */ int -slurm_submit_batch_job (job_desc_msg_t * job_desc_msg ) +slurm_submit_batch_job (job_desc_msg_t * job_desc_msg, submit_response_msg_t ** slurm_alloc_msg ) { int msg_size ; int rc ; @@ -28,7 +28,6 @@ slurm_submit_batch_job 
(job_desc_msg_t * job_desc_msg ) slurm_msg_t request_msg ; slurm_msg_t response_msg ; return_code_msg_t * slurm_rc_msg ; - resource_allocation_response_msg_t * slurm_aloc_resp_msg; /* init message connection for message communication with controller */ if ( ( sockfd = slurm_open_controller_conn ( ) ) == SLURM_SOCKET_ERROR ) @@ -55,9 +54,8 @@ slurm_submit_batch_job (job_desc_msg_t * job_desc_msg ) return (int) slurm_rc_msg->return_code ; break ; case RESPONSE_SUBMIT_BATCH_JOB: - slurm_aloc_resp_msg = ( resource_allocation_response_msg_t * ) response_msg . data ; - job_desc_msg->job_id = slurm_aloc_resp_msg->job_id; - return 0; + *slurm_alloc_msg = ( submit_response_msg_t * ) response_msg . data ; + return SLURM_SUCCESS; break; default: return SLURM_UNEXPECTED_MSG_ERROR ; diff --git a/src/common/slurm_protocol_defs.h b/src/common/slurm_protocol_defs.h index b959cd8ef926e41982bf0863ca8e69664d8ec029..4cfa822b71253ea0a8baca9d9abb6ac1d6f943d3 100644 --- a/src/common/slurm_protocol_defs.h +++ b/src/common/slurm_protocol_defs.h @@ -215,6 +215,11 @@ typedef struct resource_allocation_response_msg int32_t* cpu_count_reps; } resource_allocation_response_msg_t ; +typedef struct submit_response_msg +{ + uint32_t job_id; +} submit_response_msg_t ; + typedef struct job_desc_msg { /* Job descriptor for submit, allocate, and update requests */ uint16_t contiguous; /* 1 if job requires contiguous nodes, 0 otherwise, * default=0 */ diff --git a/src/common/slurm_protocol_pack.c b/src/common/slurm_protocol_pack.c index efe4e03695fde8e03e2025468f82e82ae7b6d8a3..4f942e54821807651bca34e9bb0daf78adfdf1b3 100644 --- a/src/common/slurm_protocol_pack.c +++ b/src/common/slurm_protocol_pack.c @@ -82,6 +82,9 @@ int pack_msg ( slurm_msg_t const * msg , char ** buffer , uint32_t * buf_len ) case REQUEST_RECONFIGURE : /* Message contains no body/information */ break ; + case RESPONSE_SUBMIT_BATCH_JOB: + pack_submit_response_msg ( ( submit_response_msg_t * ) msg -> data , ( void ** ) buffer , 
buf_len ) ; + break ; case RESPONSE_RESOURCE_ALLOCATION : case RESPONSE_IMMEDIATE_RESOURCE_ALLOCATION : case RESPONSE_JOB_WILL_RUN : @@ -145,7 +148,7 @@ int pack_msg ( slurm_msg_t const * msg , char ** buffer , uint32_t * buf_len ) pack_return_code ( ( return_code_msg_t * ) msg -> data , ( void ** ) buffer , buf_len ) ; break; default : - debug ( "No pack method for msg type %i", msg -> msg_type ) ; + error ( "No pack method for msg type %i", msg -> msg_type ) ; return EINVAL ; break; @@ -196,12 +199,14 @@ int unpack_msg ( slurm_msg_t * msg , char ** buffer , uint32_t * buf_len ) case REQUEST_RECONFIGURE : /* Message contains no body/information */ break ; + case RESPONSE_SUBMIT_BATCH_JOB : + unpack_submit_response_msg ( ( submit_response_msg_t ** ) & ( msg -> data ) , ( void ** ) buffer , buf_len ) ; + break ; case RESPONSE_RESOURCE_ALLOCATION : case RESPONSE_IMMEDIATE_RESOURCE_ALLOCATION : case RESPONSE_JOB_WILL_RUN : unpack_resource_allocation_response_msg ( ( resource_allocation_response_msg_t ** ) & ( msg -> data ) , ( void ** ) buffer , buf_len ) ; break ; - case REQUEST_UPDATE_NODE : unpack_update_node_msg ( ( update_node_msg_t ** ) & ( msg-> data ) , ( void ** ) buffer , buf_len ) ; @@ -351,6 +356,24 @@ int unpack_resource_allocation_response_msg ( resource_allocation_response_msg_t return 0 ; } +void pack_submit_response_msg ( submit_response_msg_t * msg, void ** buffer , int * length ) +{ + pack32 ( msg->job_id , ( void ** ) buffer , length ) ; +} + +int unpack_submit_response_msg ( submit_response_msg_t ** msg , void ** buffer , int * length ) +{ + submit_response_msg_t * tmp_ptr ; + /* alloc memory for structure */ + tmp_ptr = xmalloc ( sizeof ( submit_response_msg_t ) ) ; + if (tmp_ptr == NULL) + return ENOMEM; + + /* load the data values */ + unpack32 ( & tmp_ptr -> job_id , ( void ** ) buffer , length ) ; + *msg = tmp_ptr ; + return 0 ; +} void pack_node_info_msg ( slurm_msg_t * msg, void ** buf_ptr , int * buffer_size ) { assert ( msg != NULL ); 
diff --git a/src/common/slurm_protocol_pack.h b/src/common/slurm_protocol_pack.h index 8a392ea33a7cdaa6428d74e6ead717ef54b2575c..a48bb6c52912c072b3423043b8aaf2004ee8f021 100644 --- a/src/common/slurm_protocol_pack.h +++ b/src/common/slurm_protocol_pack.h @@ -61,6 +61,9 @@ int unpack_node_table ( node_table_msg_t * node , void ** buf_ptr , int * buffer void pack_resource_allocation_response_msg ( resource_allocation_response_msg_t * msg, void ** buffer , int * length ); int unpack_resource_allocation_response_msg ( resource_allocation_response_msg_t ** msg , void ** buffer , int * length ); +void pack_submit_response_msg ( submit_response_msg_t * msg, void ** buffer , int * length ); +int unpack_submit_response_msg ( submit_response_msg_t ** msg , void ** buffer , int * length ); + void pack_update_node_msg ( update_node_msg_t * msg, void ** buffer , uint32_t * length ); int unpack_update_node_msg ( update_node_msg_t ** msg , void ** buffer , uint32_t * length ); diff --git a/src/slurmctld/controller.c b/src/slurmctld/controller.c index ba990bfee8a6fee0bd630debc8393c03af7baf39..843d15cb2cfed61e8060bdf7eef60b10839e2c68 100644 --- a/src/slurmctld/controller.c +++ b/src/slurmctld/controller.c @@ -448,7 +448,7 @@ slurm_rpc_submit_batch_job ( slurm_msg_t * msg ) struct job_record *job_rec_ptr; uint32_t job_id ; slurm_msg_t response_msg ; - job_id_msg_t job_id_msg ; + submit_response_msg_t submit_msg ; job_desc_msg_t * job_desc_msg = ( job_desc_msg_t * ) msg-> data ; start_time = clock (); @@ -469,9 +469,9 @@ slurm_rpc_submit_batch_job ( slurm_msg_t * msg ) info ("slurmctld_req: job_submit success for id=%u, time=%ld", job_id, (long) (clock () - start_time)); /* send job_ID */ - job_id_msg . job_id = job_id ; + submit_msg . job_id = job_id ; response_msg . msg_type = RESPONSE_SUBMIT_BATCH_JOB ; - response_msg . data = & job_id_msg ; + response_msg . 
data = & submit_msg ; slurm_send_node_msg ( msg->conn_fd , & response_msg ) ; } schedule(); diff --git a/src/slurmctld/job_scheduler.c b/src/slurmctld/job_scheduler.c index 1cc34faec2d3026f012f808c70950f0fb840fe59..e10231427bf1e109574f6d3015d7e3f946a64030 100644 --- a/src/slurmctld/job_scheduler.c +++ b/src/slurmctld/job_scheduler.c @@ -107,22 +107,22 @@ schedule() } if (j < failed_part_cnt) continue; error_code = select_nodes(job_ptr, 0); - if (error_code == EAGAIN) { + if (error_code == ESLURM_NODES_BUSY) { xrealloc(failed_parts, (failed_part_cnt+1)*sizeof(struct part_record *)); failed_parts[failed_part_cnt++] = job_ptr->part_ptr; } - else if (error_code == EINVAL) { + else if (error_code == SLURM_SUCCESS) { /* job initiated */ + last_job_update = time (NULL); + info ("schedule: job_id %u on nodes %s", + job_ptr->job_id, job_ptr->nodes); + } + else { last_job_update = time (NULL); job_ptr->job_state = JOB_FAILED; job_ptr->start_time = job_ptr->end_time = time(NULL); delete_job_details(job_ptr); } - else { /* job initiated */ - last_job_update = time (NULL); - info ("schedule: job_id %u on nodes %s", - job_ptr->job_id, job_ptr->nodes); - } } if (failed_parts) diff --git a/src/slurmctld/node_mgr.c b/src/slurmctld/node_mgr.c index 66d950c1bac151517e4c0dce8e82663f4d829e61..c3fd4acb93b4cb0f6e27551f17b2af45ed9190ae 100644 --- a/src/slurmctld/node_mgr.c +++ b/src/slurmctld/node_mgr.c @@ -250,11 +250,10 @@ main (int argc, char *argv[]) * input: bitmap - bitmap pointer * node_list - place to put node list * output: node_list - set to node list or NULL on error - * returns 0 if no error, errno otherwise * globals: node_record_table_ptr - pointer to node table * NOTE: the caller must xfree the memory at node_list when no longer required */ -int +void bitmap2node_name (bitstr_t *bitmap, char **node_list) { int node_list_size, i; @@ -344,7 +343,6 @@ bitmap2node_name (bitstr_t *bitmap, char **node_list) strcat (node_list[0], last_suffix); } xrealloc (node_list[0], strlen 
(node_list[0]) + 1); - return 0; } diff --git a/src/slurmctld/node_scheduler.c b/src/slurmctld/node_scheduler.c index 99e15f9326dd513c3d16f571a86ff185906e60f9..702e44e55cf8713c689f2c26a5b8eb4425163f18 100644 --- a/src/slurmctld/node_scheduler.c +++ b/src/slurmctld/node_scheduler.c @@ -698,6 +698,7 @@ select_nodes (struct job_record *job_ptr, int test_only) struct part_record *part_ptr; int tmp_feature, check_node_config; + error_code = SLURM_SUCCESS; req_bitmap = scratch_bitmap = NULL; config_record_iterator = (ListIterator) NULL; node_set_ptr = NULL; @@ -849,11 +850,7 @@ select_nodes (struct job_record *job_ptr, int test_only) } /* assign the nodes and stage_in the job */ - error_code = bitmap2node_name (req_bitmap, &(job_ptr->nodes)); - if (error_code) { - error ("bitmap2node_name error %d", error_code); - goto cleanup; - } + bitmap2node_name (req_bitmap, &(job_ptr->nodes)); build_node_list (req_bitmap, &job_ptr->details->node_list, &job_ptr->details->total_procs); diff --git a/src/slurmctld/slurmctld.h b/src/slurmctld/slurmctld.h index b26c5a5ab762c729983fa909bf4376ef81d3d3d6..b430c70f18a0e3f2414154e370d711425b0f2171 100644 --- a/src/slurmctld/slurmctld.h +++ b/src/slurmctld/slurmctld.h @@ -203,11 +203,10 @@ extern void allocate_nodes (unsigned *bitmap); * input: bitmap - bitmap pointer * node_list - place to put node list * output: node_list - set to node list or null on error - * returns 0 if no error, otherwise einval or enomem * NOTE: consider returning the node list as a regular expression if helpful * NOTE: the caller must free memory at node_list when no longer required */ -extern int bitmap2node_name (bitstr_t *bitmap, char **node_list); +extern void bitmap2node_name (bitstr_t *bitmap, char **node_list); /* * block_or_cycle - map string into integer diff --git a/testsuite/slurm_unit/api/manual/submit-tst.c b/testsuite/slurm_unit/api/manual/submit-tst.c index bbe93fb134a4f16e5e1735d4c9f62183ce21ce8c..2d4201f183c2930af044417defcb0673448e3449 100644 --- 
a/testsuite/slurm_unit/api/manual/submit-tst.c +++ b/testsuite/slurm_unit/api/manual/submit-tst.c @@ -12,6 +12,7 @@ main (int argc, char *argv[]) { int error_code, i, count; job_desc_msg_t job_mesg; + submit_response_msg_t *resp_msg; slurm_init_job_desc_msg( &job_mesg ); job_mesg. contiguous = 1; @@ -31,13 +32,13 @@ main (int argc, char *argv[]) job_mesg. num_nodes = 400; job_mesg. user_id = 1500; - error_code = slurm_submit_batch_job( &job_mesg ); + error_code = slurm_submit_batch_job( &job_mesg, &resp_msg ); if (error_code) { printf ("submit error %d\n", error_code); return (error_code); } else - printf ("job %u submitted\n", job_mesg.job_id); + printf ("job %u submitted\n", resp_msg->job_id); if (argc > 1) count = atoi (argv[1]); @@ -45,14 +46,29 @@ main (int argc, char *argv[]) count = 5; for (i=0; i<count; i++) { - job_mesg.job_id = job_mesg.job_id + i; - error_code = slurm_submit_batch_job( &job_mesg); + slurm_init_job_desc_msg( &job_mesg ); + job_mesg. contiguous = 1; + job_mesg. groups = ("students,employee\0"); + job_mesg. name = ("job01\0"); + job_mesg. partition_key = NULL; + job_mesg. min_procs = 4; + job_mesg. min_memory = 1024 + i; + job_mesg. min_tmp_disk = 2034 + i; + job_mesg. partition = "batch\0"; + job_mesg. priority = 100 + i; + job_mesg. job_script = "/bin/hostname\0"; + job_mesg. shared = 0; + job_mesg. time_limit = 100 + i; + job_mesg. num_procs = 1000 + i; + job_mesg. num_nodes = 400 + i; + job_mesg. user_id = 1500; + error_code = slurm_submit_batch_job( &job_mesg, &resp_msg ); if (error_code) { printf ("submit error %d\n", error_code); break; } else { - printf ("job %u submitted\n", job_mesg.job_id); + printf ("job %u submitted\n", resp_msg->job_id); } }