Skip to content
Snippets Groups Projects
Commit 950a7a9c authored by jce's avatar jce
Browse files

cleaned up some of the job_step stuff.

parent cdd2dc3f
No related branches found
No related tags found
No related merge requests found
...@@ -220,12 +220,16 @@ typedef struct slurm_node_registration_status_msg ...@@ -220,12 +220,16 @@ typedef struct slurm_node_registration_status_msg
typedef struct job_step_create_request_msg typedef struct job_step_create_request_msg
{ {
uint32_t job_id;
uint32_t user_id;
uint32_t node_count; uint32_t node_count;
uint32_t proc_count; uint32_t cpu_count;
uint16_t relative; uint16_t relative;
char* node_list; char* node_list;
} job_step_create_request_msg_t; } job_step_create_request_msg_t;
typedef struct job_step_create_request_msg job_step_specs_t;
typedef struct job_step_create_response_msg typedef struct job_step_create_response_msg
{ {
uint32_t job_step_id; uint32_t job_step_id;
......
...@@ -540,8 +540,10 @@ void pack_job_step_create_request_msg ( job_step_create_request_msg_t* msg , voi ...@@ -540,8 +540,10 @@ void pack_job_step_create_request_msg ( job_step_create_request_msg_t* msg , voi
{ {
assert ( msg != NULL ); assert ( msg != NULL );
pack32 ( msg -> job_id, ( void ** ) buffer , length ) ;
pack32 ( msg -> user_id, ( void ** ) buffer , length ) ;
pack32 ( msg -> node_count, ( void ** ) buffer , length ) ; pack32 ( msg -> node_count, ( void ** ) buffer , length ) ;
pack32 ( msg -> proc_count, ( void ** ) buffer , length ) ; pack32 ( msg -> cpu_count, ( void ** ) buffer , length ) ;
pack16 ( msg -> relative, ( void ** ) buffer , length ) ; pack16 ( msg -> relative, ( void ** ) buffer , length ) ;
packstr ( msg -> node_list, ( void ** ) buffer , length ) ; packstr ( msg -> node_list, ( void ** ) buffer , length ) ;
} }
...@@ -555,8 +557,10 @@ int unpack_job_step_create_request_msg ( job_step_create_request_msg_t** msg , v ...@@ -555,8 +557,10 @@ int unpack_job_step_create_request_msg ( job_step_create_request_msg_t** msg , v
if (tmp_ptr == NULL) if (tmp_ptr == NULL)
return ENOMEM; return ENOMEM;
unpack32 ( &( tmp_ptr -> job_id), ( void ** ) buffer , length ) ;
unpack32 ( &( tmp_ptr -> user_id), ( void ** ) buffer , length ) ;
unpack32 ( &( tmp_ptr -> node_count), ( void ** ) buffer , length ) ; unpack32 ( &( tmp_ptr -> node_count), ( void ** ) buffer , length ) ;
unpack32 ( &( tmp_ptr -> proc_count), ( void ** ) buffer , length ) ; unpack32 ( &( tmp_ptr -> cpu_count), ( void ** ) buffer , length ) ;
unpack16 ( &( tmp_ptr -> relative), ( void ** ) buffer , length ) ; unpack16 ( &( tmp_ptr -> relative), ( void ** ) buffer , length ) ;
unpackstr_xmalloc ( &( tmp_ptr -> node_list ), &uint16_tmp, ( void ** ) buffer , length ) ; unpackstr_xmalloc ( &( tmp_ptr -> node_list ), &uint16_tmp, ( void ** ) buffer , length ) ;
......
...@@ -669,38 +669,39 @@ void ...@@ -669,38 +669,39 @@ void
slurm_rpc_job_step_create( slurm_msg_t* msg ) slurm_rpc_job_step_create( slurm_msg_t* msg )
{ {
/* init */ /* init */
int error_code=0; int error_code;
clock_t start_time; clock_t start_time;
slurm_msg_t resp; slurm_msg_t resp;
struct step_record* step_rec;
job_step_create_response_msg_t job_step_resp; job_step_create_response_msg_t job_step_resp;
job_step_create_request_msg_t * req_step_msg = job_step_create_request_msg_t * req_step_msg =
( job_step_create_request_msg_t* ) msg-> data ; ( job_step_create_request_msg_t* ) msg-> data ;
start_time = clock (); start_time = clock ();
/* do RPC call */ error_code = step_create ( req_step_msg, &step_rec );
/* error_code = job_step_cancel ( job_step_id_msg->job_id ,
job_step_id_msg->job_step_id); /* return result */
*/ /* return result */ if ( step_rec == NULL )
if (error_code)
{ {
info ("job_step_create error %d time=%ld", error_code, info ("job_step_create error %s time=%ld", slurm_strerror( error_code ),
(long) (clock () - start_time)); (long) (clock () - start_time));
slurm_send_rc_msg ( msg , error_code ); slurm_send_rc_msg ( msg , error_code );
} }
else else
{ {
/* FIXME Needs to be fixed to really work with a credential */
slurm_job_credential_t cred = { 1,1,"test",start_time,0} ; slurm_job_credential_t cred = { 1,1,"test",start_time,0} ;
info ("job_step_create success time=%ld", info ("job_step_create success time=%ld",
(long) (clock () - start_time)); (long) (clock () - start_time));
job_step_resp.job_step_id = 23; job_step_resp.job_step_id = step_rec->step_id;
job_step_resp.node_list = cred.node_list; bitmap2node_name( step_rec->node_bitmap, &(job_step_resp.node_list) );
job_step_resp.credentials = &cred; job_step_resp.credentials = &cred;
#ifdef HAVE_LIBELAN3 #ifdef HAVE_LIBELAN3
/* FIXME */ /* FIXME */
resp.qsw_job; /* Elan3 switch context, opaque data structure */
#endif #endif
resp. address = msg -> address ; resp. address = msg -> address ;
resp. msg_type = RESPONSE_JOB_STEP_CREATE ; resp. msg_type = RESPONSE_JOB_STEP_CREATE ;
......
...@@ -199,15 +199,7 @@ struct step_record { ...@@ -199,15 +199,7 @@ struct step_record {
#endif #endif
}; };
struct step_specs { typedef struct job_step_create_request_msg step_specs;
uint32_t job_id; /* job ID */
uint32_t step_id; /* step number */
uint32_t user_id; /* user the job runs as */
uint32_t min_nodes; /* count of required nodes */
uint32_t min_cpus; /* count of required processors */
char *node_list; /* list of required nodes */
char *relative_node_list; /* relative positions of required nodes */
};
extern List job_list; /* list of job_record entries */ extern List job_list; /* list of job_record entries */
...@@ -690,7 +682,7 @@ extern int slurm_parser (char *spec, ...); ...@@ -690,7 +682,7 @@ extern int slurm_parser (char *spec, ...);
* output: returns 0 on success, EINVAL if specification is invalid * output: returns 0 on success, EINVAL if specification is invalid
* NOTE: the calling program must xfree the memory pointed to by new_job_id * NOTE: the calling program must xfree the memory pointed to by new_job_id
*/ */
extern int step_create (struct step_specs *step_specs); extern int step_create ( step_specs *step_specs, struct step_record** );
/* step_lock - lock the step information /* step_lock - lock the step information
* global: step_mutex - semaphore for the step table * global: step_mutex - semaphore for the step table
......
...@@ -40,8 +40,7 @@ ...@@ -40,8 +40,7 @@
#define BUF_SIZE 1024 #define BUF_SIZE 1024
bitstr_t * pick_step_nodes (struct job_record *job_ptr, int min_nodes, int min_cpus, bitstr_t * pick_step_nodes (struct job_record *job_ptr, step_specs *step_spec );
char *node_list, char *relative_node_list);
/* /*
* create_step_record - create an empty step_record for the specified job. * create_step_record - create an empty step_record for the specified job.
...@@ -249,12 +248,12 @@ pack_step (struct step_record *dump_step_ptr, void **buf_ptr, int *buf_len) ...@@ -249,12 +248,12 @@ pack_step (struct step_record *dump_step_ptr, void **buf_ptr, int *buf_len)
* pick_step_nodes - select nodes for a job step that satisfy its requirements * pick_step_nodes - select nodes for a job step that satisfy its requirements
* we satisfy the super-set of constraints. * we satisfy the super-set of constraints.
* global: node_record_table_ptr - pointer to global node table * global: node_record_table_ptr - pointer to global node table
* NOTE: returns all of a job's nodes if min_nodes == INFINITE * NOTE: returns all of a job's nodes if step_spec->node_count == INFINITE
* NOTE: returned bitmap must be freed by the caller using bit_free() * NOTE: returned bitmap must be freed by the caller using bit_free()
*/ */
bitstr_t * bitstr_t *
pick_step_nodes (struct job_record *job_ptr, int min_nodes, int min_cpus, pick_step_nodes (struct job_record *job_ptr, step_specs *step_spec ) {
char *node_list, char *relative_node_list) {
bitstr_t *nodes_avail = NULL, *nodes_picked = NULL, *node_tmp = NULL; bitstr_t *nodes_avail = NULL, *nodes_picked = NULL, *node_tmp = NULL;
int error_code, nodes_picked_cnt = 0, cpus_picked_cnt, i; int error_code, nodes_picked_cnt = 0, cpus_picked_cnt, i;
...@@ -263,41 +262,43 @@ pick_step_nodes (struct job_record *job_ptr, int min_nodes, int min_cpus, ...@@ -263,41 +262,43 @@ pick_step_nodes (struct job_record *job_ptr, int min_nodes, int min_cpus,
nodes_avail = bit_copy(job_ptr->node_bitmap); nodes_avail = bit_copy(job_ptr->node_bitmap);
if (min_nodes == INFINITE) /* return all available nodes */ if ( step_spec->node_count == INFINITE) /* return all available nodes */
return nodes_avail; return nodes_avail;
if (node_list) { if (step_spec->node_list) {
error_code = node_name2bitmap (node_list, &nodes_picked); if ( step_spec->relative ) {
if (error_code) { /* FIXME need to resolve format of relative_node_list */
info ("pick_step_nodes: invalid node list %s", node_list); info ("pick_step_nodes: relative_node_list not yet supported");
goto cleanup;
} }
if (bit_super_set (nodes_picked, job_ptr->node_bitmap) == 0) { else {
info ("pick_step_nodes: requested nodes %s not part of job %u", error_code = node_name2bitmap (step_spec->node_list, &nodes_picked);
node_list, job_ptr->job_id); if (error_code) {
goto cleanup; info ("pick_step_nodes: invalid node list %s", step_spec->node_list);
goto cleanup;
}
if (bit_super_set (nodes_picked, job_ptr->node_bitmap) == 0) {
info ("pick_step_nodes: requested nodes %s not part of job %u",
step_spec->node_list, job_ptr->job_id);
goto cleanup;
}
} }
} }
else else
nodes_picked = bit_alloc (bit_size (nodes_avail) ); nodes_picked = bit_alloc (bit_size (nodes_avail) );
if (relative_node_list) {
/* need to resolve format of relative_node_list */
info ("pick_step_nodes: relative_node_list not yet supported");
}
/* if user specifies step needs a specific processor count and all nodes */ /* if user specifies step needs a specific processor count and all nodes */
/* have the same processor count, just translate this to a node count */ /* have the same processor count, just translate this to a node count */
if (min_cpus && (job_ptr->num_cpu_groups == 1)) { if (step_spec->cpu_count && (job_ptr->num_cpu_groups == 1)) {
i = (min_cpus + (job_ptr->cpus_per_node[0] - 1) ) / job_ptr->cpus_per_node[0]; i = (step_spec->cpu_count + (job_ptr->cpus_per_node[0] - 1) ) / job_ptr->cpus_per_node[0];
min_nodes = (i > min_nodes) ? i : min_nodes; step_spec->node_count = (i > step_spec->node_count) ? i : step_spec->node_count ;
min_cpus = 0; step_spec->cpu_count = 0;
} }
if (min_nodes) { if (step_spec->node_count) {
nodes_picked_cnt = bit_set_count(nodes_picked); nodes_picked_cnt = bit_set_count(nodes_picked);
if (min_nodes > nodes_picked_cnt) { if (step_spec->node_count > nodes_picked_cnt) {
node_tmp = bit_pick_cnt(nodes_avail, (min_nodes - nodes_picked_cnt)); node_tmp = bit_pick_cnt(nodes_avail, (step_spec->node_count - nodes_picked_cnt));
if (node_tmp == NULL) if (node_tmp == NULL)
goto cleanup; goto cleanup;
bit_or (nodes_picked, node_tmp); bit_or (nodes_picked, node_tmp);
...@@ -305,13 +306,13 @@ pick_step_nodes (struct job_record *job_ptr, int min_nodes, int min_cpus, ...@@ -305,13 +306,13 @@ pick_step_nodes (struct job_record *job_ptr, int min_nodes, int min_cpus,
bit_and (nodes_avail, node_tmp); bit_and (nodes_avail, node_tmp);
bit_free (node_tmp); bit_free (node_tmp);
node_tmp = NULL; node_tmp = NULL;
nodes_picked_cnt = min_nodes; nodes_picked_cnt = step_spec->node_count;
} }
} }
if (min_cpus) { if (step_spec->cpu_count) {
cpus_picked_cnt = count_cpus(nodes_picked); cpus_picked_cnt = count_cpus(nodes_picked);
if (min_cpus > cpus_picked_cnt) { if (step_spec->cpu_count > cpus_picked_cnt) {
int first_bit, last_bit; int first_bit, last_bit;
first_bit = bit_ffs(nodes_avail); first_bit = bit_ffs(nodes_avail);
last_bit = bit_fls(nodes_avail); last_bit = bit_fls(nodes_avail);
...@@ -320,10 +321,10 @@ pick_step_nodes (struct job_record *job_ptr, int min_nodes, int min_cpus, ...@@ -320,10 +321,10 @@ pick_step_nodes (struct job_record *job_ptr, int min_nodes, int min_cpus,
continue; continue;
bit_set (nodes_picked, i); bit_set (nodes_picked, i);
cpus_picked_cnt += node_record_table_ptr[i].cpus; cpus_picked_cnt += node_record_table_ptr[i].cpus;
if (cpus_picked_cnt >= min_cpus) if (cpus_picked_cnt >= step_spec->cpu_count)
break; break;
} }
if (min_cpus > cpus_picked_cnt) if (step_spec->cpu_count > cpus_picked_cnt)
goto cleanup; goto cleanup;
} }
} }
...@@ -342,14 +343,16 @@ cleanup: ...@@ -342,14 +343,16 @@ cleanup:
/* /*
* step_create - parse the supplied job step specification and create step_records for it * step_create - creates a step_record in step_specs->job_id and sets it up
* according to the step_specs.
* input: step_specs - job step specifications * input: step_specs - job step specifications
* output: returns 0 on success, EINVAL if specification is invalid * output: SUCCESS: returns SLURM_SUCCESS and stores a pointer to the step_record in *new_step_record
* globals: step_list - pointer to global job step list * FAILURE: returns an ESLURM_* error code and leaves *new_step_record unset
* NOTE: the calling program must xfree the memory pointed to by new_job_id * NOTE: don't free the returned step_record because that is managed through
* the job.
*/ */
int int
step_create (struct step_specs *step_specs) step_create ( step_specs *step_specs, struct step_record** new_step_record )
{ {
struct step_record *step_ptr; struct step_record *step_ptr;
struct job_record *job_ptr; struct job_record *job_ptr;
...@@ -360,21 +363,26 @@ step_create (struct step_specs *step_specs) ...@@ -360,21 +363,26 @@ step_create (struct step_specs *step_specs)
#endif #endif
job_ptr = find_job_record (step_specs->job_id); job_ptr = find_job_record (step_specs->job_id);
if (job_ptr == NULL) if (job_ptr == NULL)
return ESLURM_INVALID_JOB_ID; return ESLURM_INVALID_JOB_ID ;
if (step_specs->user_id != job_ptr->user_id && if (step_specs->user_id != job_ptr->user_id &&
step_specs->user_id != 0) step_specs->user_id != 0)
return ESLURM_ACCESS_DENIED; return ESLURM_ACCESS_DENIED ;
nodeset = pick_step_nodes (job_ptr, step_specs );
nodeset = pick_step_nodes (job_ptr, step_specs->min_nodes, step_specs->min_cpus,
step_specs->node_list, step_specs->relative_node_list);
if (nodeset == NULL) if (nodeset == NULL)
return ESLURM_REQUESTED_NODE_CONFIG_UNAVAILABLE; return ESLURM_REQUESTED_NODE_CONFIG_UNAVAILABLE ;
/* FIXME need to set the error codes and define them
* probably shouldn't exit w/ a fatal...
*/
step_ptr = create_step_record (job_ptr); step_ptr = create_step_record (job_ptr);
if (step_ptr == NULL) if (step_ptr == NULL)
fatal ("create_step_record failed with no memory"); fatal ("create_step_record failed with no memory");
/* set the step_record values */
step_ptr->step_id = (job_ptr->next_step_id)++; step_ptr->step_id = (job_ptr->next_step_id)++;
step_ptr->node_bitmap = nodeset; step_ptr->node_bitmap = nodeset;
...@@ -396,5 +404,7 @@ step_create (struct step_specs *step_specs) ...@@ -396,5 +404,7 @@ step_create (struct step_specs *step_specs)
fatal ("step_create: qsw_setup_jobinfo error"); fatal ("step_create: qsw_setup_jobinfo error");
bit_free (nodeset); bit_free (nodeset);
#endif #endif
return 0;
*new_step_record = step_ptr;
return SLURM_SUCCESS;
} }
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment