From 950a7a9c7f730d67ad6086e703a8a397eeb5ec2f Mon Sep 17 00:00:00 2001 From: jce <jce@unknown> Date: Fri, 12 Jul 2002 22:07:40 +0000 Subject: [PATCH] cleaned up some of the job_step stuff. --- src/common/slurm_protocol_defs.h | 6 +- src/common/slurm_protocol_pack.c | 8 ++- src/slurmctld/controller.c | 25 ++++---- src/slurmctld/slurmctld.h | 12 +--- src/slurmctld/step_mgr.c | 102 +++++++++++++++++-------------- 5 files changed, 82 insertions(+), 71 deletions(-) diff --git a/src/common/slurm_protocol_defs.h b/src/common/slurm_protocol_defs.h index 2c1c4d3184a..9550dc19ff1 100644 --- a/src/common/slurm_protocol_defs.h +++ b/src/common/slurm_protocol_defs.h @@ -220,12 +220,16 @@ typedef struct slurm_node_registration_status_msg typedef struct job_step_create_request_msg { + uint32_t job_id; + uint32_t user_id; uint32_t node_count; - uint32_t proc_count; + uint32_t cpu_count; uint16_t relative; char* node_list; } job_step_create_request_msg_t; +typedef struct job_step_create_request_msg job_step_specs_t; + typedef struct job_step_create_response_msg { uint32_t job_step_id; diff --git a/src/common/slurm_protocol_pack.c b/src/common/slurm_protocol_pack.c index 7286835b9d5..b86bb119908 100644 --- a/src/common/slurm_protocol_pack.c +++ b/src/common/slurm_protocol_pack.c @@ -540,8 +540,10 @@ void pack_job_step_create_request_msg ( job_step_create_request_msg_t* msg , voi { assert ( msg != NULL ); + pack32 ( msg -> job_id, ( void ** ) buffer , length ) ; + pack32 ( msg -> user_id, ( void ** ) buffer , length ) ; pack32 ( msg -> node_count, ( void ** ) buffer , length ) ; - pack32 ( msg -> proc_count, ( void ** ) buffer , length ) ; + pack32 ( msg -> cpu_count, ( void ** ) buffer , length ) ; pack16 ( msg -> relative, ( void ** ) buffer , length ) ; packstr ( msg -> node_list, ( void ** ) buffer , length ) ; } @@ -555,8 +557,10 @@ int unpack_job_step_create_request_msg ( job_step_create_request_msg_t** msg , v if (tmp_ptr == NULL) return ENOMEM; + unpack32 ( &( tmp_ptr -> 
job_id), ( void ** ) buffer , length ) ; + unpack32 ( &( tmp_ptr -> user_id), ( void ** ) buffer , length ) ; unpack32 ( &( tmp_ptr -> node_count), ( void ** ) buffer , length ) ; - unpack32 ( &( tmp_ptr -> proc_count), ( void ** ) buffer , length ) ; + unpack32 ( &( tmp_ptr -> cpu_count), ( void ** ) buffer , length ) ; unpack16 ( &( tmp_ptr -> relative), ( void ** ) buffer , length ) ; unpackstr_xmalloc ( &( tmp_ptr -> node_list ), &uint16_tmp, ( void ** ) buffer , length ) ; diff --git a/src/slurmctld/controller.c b/src/slurmctld/controller.c index d1ba83f50fb..86f7b9fc164 100644 --- a/src/slurmctld/controller.c +++ b/src/slurmctld/controller.c @@ -669,38 +669,39 @@ void slurm_rpc_job_step_create( slurm_msg_t* msg ) { /* init */ - int error_code=0; + int error_code; clock_t start_time; slurm_msg_t resp; + struct step_record* step_rec; job_step_create_response_msg_t job_step_resp; job_step_create_request_msg_t * req_step_msg = ( job_step_create_request_msg_t* ) msg-> data ; start_time = clock (); - /* do RPC call */ -/* error_code = job_step_cancel ( job_step_id_msg->job_id , - job_step_id_msg->job_step_id); -*/ /* return result */ - if (error_code) + error_code = step_create ( req_step_msg, &step_rec ); + + /* return result */ + if ( step_rec == NULL ) { - info ("job_step_create error %d time=%ld", error_code, + info ("job_step_create error %s time=%ld", slurm_strerror( error_code ), (long) (clock () - start_time)); slurm_send_rc_msg ( msg , error_code ); } else { + /* FIXME Needs to be fixed to really work with a credential */ slurm_job_credential_t cred = { 1,1,"test",start_time,0} ; info ("job_step_create success time=%ld", (long) (clock () - start_time)); - - job_step_resp.job_step_id = 23; - job_step_resp.node_list = cred.node_list; - job_step_resp.credentials = &cred; + + job_step_resp.job_step_id = step_rec->step_id; + bitmap2node_name( step_rec->node_bitmap, &(job_step_resp.node_list) ); + job_step_resp.credentials = &cred; + #ifdef HAVE_LIBELAN3 /* 
FIXME */ - resp.qsw_job; /* Elan3 switch context, opaque data structure */ #endif resp. address = msg -> address ; resp. msg_type = RESPONSE_JOB_STEP_CREATE ; diff --git a/src/slurmctld/slurmctld.h b/src/slurmctld/slurmctld.h index af0248af866..9c1674b3eb4 100644 --- a/src/slurmctld/slurmctld.h +++ b/src/slurmctld/slurmctld.h @@ -199,15 +199,7 @@ struct step_record { #endif }; -struct step_specs { - uint32_t job_id; /* job ID */ - uint32_t step_id; /* step number */ - uint32_t user_id; /* user the job runs as */ - uint32_t min_nodes; /* count of required nodes */ - uint32_t min_cpus; /* count of required processors */ - char *node_list; /* list of required nodes */ - char *relative_node_list; /* relative positions of required nodes */ -}; +typedef struct job_step_create_request_msg step_specs; extern List job_list; /* list of job_record entries */ @@ -690,7 +682,7 @@ extern int slurm_parser (char *spec, ...); * output: returns 0 on success, EINVAL if specification is invalid * NOTE: the calling program must xfree the memory pointed to by new_job_id */ -extern int step_create (struct step_specs *step_specs); +extern int step_create ( step_specs *step_specs, struct step_record** ); /* step_lock - lock the step information * global: step_mutex - semaphore for the step table diff --git a/src/slurmctld/step_mgr.c b/src/slurmctld/step_mgr.c index e005240451e..5e0a147f2d6 100644 --- a/src/slurmctld/step_mgr.c +++ b/src/slurmctld/step_mgr.c @@ -40,8 +40,7 @@ #define BUF_SIZE 1024 -bitstr_t * pick_step_nodes (struct job_record *job_ptr, int min_nodes, int min_cpus, - char *node_list, char *relative_node_list); +bitstr_t * pick_step_nodes (struct job_record *job_ptr, step_specs *step_spec ); /* * create_step_record - create an empty step_record for the specified job. 
@@ -249,12 +248,12 @@ pack_step (struct step_record *dump_step_ptr, void **buf_ptr, int *buf_len) * pick_step_nodes - select nodes for a job step that satify its requirements * we satify the super-set of constraints. * global: node_record_table_ptr - pointer to global node table - * NOTE: returns all of a job's nodes if min_nodes == INFINITE + * NOTE: returns all of a job's nodes if step_spec->node_count == INFINITE * NOTE: returned bitmap must be freed by the caller using bit_free() */ bitstr_t * -pick_step_nodes (struct job_record *job_ptr, int min_nodes, int min_cpus, - char *node_list, char *relative_node_list) { +pick_step_nodes (struct job_record *job_ptr, step_specs *step_spec ) { + bitstr_t *nodes_avail = NULL, *nodes_picked = NULL, *node_tmp = NULL; int error_code, nodes_picked_cnt = 0, cpus_picked_cnt, i; @@ -263,41 +262,43 @@ pick_step_nodes (struct job_record *job_ptr, int min_nodes, int min_cpus, nodes_avail = bit_copy(job_ptr->node_bitmap); - if (min_nodes == INFINITE) /* return all available nodes */ + if ( step_spec->node_count == INFINITE) /* return all available nodes */ return nodes_avail; - if (node_list) { - error_code = node_name2bitmap (node_list, &nodes_picked); - if (error_code) { - info ("pick_step_nodes: invalid node list %s", node_list); - goto cleanup; - } - if (bit_super_set (nodes_picked, job_ptr->node_bitmap) == 0) { - info ("pick_step_nodes: requested nodes %s not part of job %u", - node_list, job_ptr->job_id); - goto cleanup; + if (step_spec->node_list) { + if ( step_spec->relative ) { + /* FIXME need to resolve format of relative_node_list */ + info ("pick_step_nodes: relative_node_list not yet supported"); + + } + else { + error_code = node_name2bitmap (step_spec->node_list, &nodes_picked); + if (error_code) { + info ("pick_step_nodes: invalid node list %s", step_spec->node_list); + goto cleanup; + } + if (bit_super_set (nodes_picked, job_ptr->node_bitmap) == 0) { + info ("pick_step_nodes: requested nodes %s not part of job %u", 
+ step_spec->node_list, job_ptr->job_id); + goto cleanup; + } } } else nodes_picked = bit_alloc (bit_size (nodes_avail) ); - if (relative_node_list) { -/* need to resolve format of relative_node_list */ - info ("pick_step_nodes: relative_node_list not yet supported"); - } - /* if user specifies step needs a specific processor count and all nodes */ /* have the same processor count, just translate this to a node count */ - if (min_cpus && (job_ptr->num_cpu_groups == 1)) { - i = (min_cpus + (job_ptr->cpus_per_node[0] - 1) ) / job_ptr->cpus_per_node[0]; - min_nodes = (i > min_nodes) ? i : min_nodes; - min_cpus = 0; + if (step_spec->cpu_count && (job_ptr->num_cpu_groups == 1)) { + i = (step_spec->cpu_count + (job_ptr->cpus_per_node[0] - 1) ) / job_ptr->cpus_per_node[0]; + step_spec->node_count = (i > step_spec->node_count) ? i : step_spec->node_count ; + step_spec->cpu_count = 0; } - if (min_nodes) { + if (step_spec->node_count) { nodes_picked_cnt = bit_set_count(nodes_picked); - if (min_nodes > nodes_picked_cnt) { - node_tmp = bit_pick_cnt(nodes_avail, (min_nodes - nodes_picked_cnt)); + if (step_spec->node_count > nodes_picked_cnt) { + node_tmp = bit_pick_cnt(nodes_avail, (step_spec->node_count - nodes_picked_cnt)); if (node_tmp == NULL) goto cleanup; bit_or (nodes_picked, node_tmp); @@ -305,13 +306,13 @@ pick_step_nodes (struct job_record *job_ptr, int min_nodes, int min_cpus, bit_and (nodes_avail, node_tmp); bit_free (node_tmp); node_tmp = NULL; - nodes_picked_cnt = min_nodes; + nodes_picked_cnt = step_spec->node_count; } } - if (min_cpus) { + if (step_spec->cpu_count) { cpus_picked_cnt = count_cpus(nodes_picked); - if (min_cpus > cpus_picked_cnt) { + if (step_spec->cpu_count > cpus_picked_cnt) { int first_bit, last_bit; first_bit = bit_ffs(nodes_avail); last_bit = bit_fls(nodes_avail); @@ -320,10 +321,10 @@ pick_step_nodes (struct job_record *job_ptr, int min_nodes, int min_cpus, continue; bit_set (nodes_picked, i); cpus_picked_cnt += node_record_table_ptr[i].cpus; 
- if (cpus_picked_cnt >= min_cpus) + if (cpus_picked_cnt >= step_spec->cpu_count) break; } - if (min_cpus > cpus_picked_cnt) + if (step_spec->cpu_count > cpus_picked_cnt) goto cleanup; } } @@ -342,14 +343,16 @@ cleanup: /* - * step_create - parse the suppied job step specification and create step_records for it + * step_create - creates a step_record in step_specs->job_id and sets it up + * according to the step_specs. * input: step_specs - job step specifications - * output: returns 0 on success, EINVAL if specification is invalid - * globals: step_list - pointer to global job step list - * NOTE: the calling program must xfree the memory pointed to by new_job_id + * output: SUCCESS: returns SLURM_SUCCESS and stores the new step_record in *new_step_record + * FAILURE: returns an ESLURM_* error code + * NOTE: don't free the returned step_record because that is managed through + * the job. */ int -step_create (struct step_specs *step_specs) +step_create ( step_specs *step_specs, struct step_record** new_step_record ) { struct step_record *step_ptr; struct job_record *job_ptr; @@ -360,21 +363,26 @@ step_create (struct step_specs *step_specs) #endif job_ptr = find_job_record (step_specs->job_id); - if (job_ptr == NULL) - return ESLURM_INVALID_JOB_ID; + if (job_ptr == NULL) + return ESLURM_INVALID_JOB_ID ; + if (step_specs->user_id != job_ptr->user_id && - step_specs->user_id != 0) - return ESLURM_ACCESS_DENIED; + step_specs->user_id != 0) + return ESLURM_ACCESS_DENIED ; + + nodeset = pick_step_nodes (job_ptr, step_specs ); - nodeset = pick_step_nodes (job_ptr, step_specs->min_nodes, step_specs->min_cpus, - step_specs->node_list, step_specs->relative_node_list); if (nodeset == NULL) - return ESLURM_REQUESTED_NODE_CONFIG_UNAVAILABLE; + return ESLURM_REQUESTED_NODE_CONFIG_UNAVAILABLE ; + /* FIXME need to set the error codes and define them + * probably shouldn't exit w/ a fatal... 
+ */ step_ptr = create_step_record (job_ptr); if (step_ptr == NULL) fatal ("create_step_record failed with no memory"); + /* set the step_record values */ step_ptr->step_id = (job_ptr->next_step_id)++; step_ptr->node_bitmap = nodeset; @@ -396,5 +404,7 @@ step_create (struct step_specs *step_specs) fatal ("step_create: qsw_setup_jobinfo error"); bit_free (nodeset); #endif - return 0; + + *new_step_record = step_ptr; + return SLURM_SUCCESS; } -- GitLab