From c45b5c4689a05a862405d3dafca60221d0bb723a Mon Sep 17 00:00:00 2001 From: Moe Jette <jette1@llnl.gov> Date: Thu, 25 Aug 2005 19:23:53 +0000 Subject: [PATCH] Add name and network parameters to job step information, reported by squeue and scontrol, data saved/restored by slurmctld, etc. --- NEWS | 2 ++ doc/man/man1/squeue.1 | 18 ++++++++-------- slurm/slurm.h.in | 4 ++++ src/api/job_step_info.c | 5 +++-- src/common/slurm_protocol_defs.c | 2 ++ src/common/slurm_protocol_pack.c | 19 ++++++++++++++--- src/common/slurm_protocol_pack.h | 3 ++- src/slurmctld/job_mgr.c | 11 +++++++++- src/slurmctld/proc_req.c | 5 ++++- src/slurmctld/slurmctld.h | 7 ++++--- src/slurmctld/step_mgr.c | 35 +++++++++++++++++++++++--------- src/squeue/opts.c | 6 ++++++ src/squeue/print.c | 12 +++++++++++ src/squeue/print.h | 4 ++++ src/squeue/squeue.c | 2 +- src/srun/allocate.c | 2 ++ 16 files changed, 106 insertions(+), 31 deletions(-) diff --git a/NEWS b/NEWS index e34b78336ef..489f0badb4d 100644 --- a/NEWS +++ b/NEWS @@ -5,6 +5,8 @@ documents those changes that are of interest to users and admins. ============================= -- Add code so job request for shared nodes gets explicitly requested nodes, but lightly loaded nodes otherwise. + -- Add job step name field. + -- Add job step network specification field. -- Add proctrack/rms plugin -- Change the proctrack API to send a slurmd_job_t pointer to both slurm_container_create() and slurm_container_add(). One of those diff --git a/doc/man/man1/squeue.1 b/doc/man/man1/squeue.1 index 34195c5ada2..0afbf02c22c 100644 --- a/doc/man/man1/squeue.1 +++ b/doc/man/man1/squeue.1 @@ -1,4 +1,4 @@ -.TH SQUEUE "1" "July 2005" "squeue 0.5" "Slurm components" +.TH SQUEUE "1" "August 2005" "squeue 0.6" "Slurm components" .SH "NAME" squeue \- view information about jobs located in the SLURM scheduling queue. @@ -62,7 +62,7 @@ various options are ".7i %.9P %.8j %.8u %.8T %.9M %.9l %.6D %R" .TP .I "-s, --steps" -"%10i %.9P %.8u %.9M %N" +"%10i %.8j %.9P %.8u %.9M %N" .RE .IP @@ -118,7 +118,7 @@ Can the nodes allocated to the job be shared with other jobs Job or job step id .TP \fB%j\fR -Job name +Job or job step name .TP \fB%l\fR Time limit of the job in days:hours:minutes:seconds. @@ -355,13 +355,13 @@ Print the job steps in the debug partition sorted by user: .br # squeue -s -p debug -S u .br - STEPID PARTITION USER TIME_USED NODELIST(REASON) + STEPID NAME PARTITION USER TIME_USED NODELIST(REASON) .br - 65552.1 debug alice 0:23 dev[1-4] + 65552.1 test1 debug alice 0:23 dev[1-4] .br - 65562.2 debug bob 0:18 dev22 + 65562.2 big_run debug bob 0:18 dev22 .br - 65550.1 debug candice 1:43:21 dev[6-12] + 65550.1 param1 debug candice 1:43:21 dev[6-12] .ec .eo @@ -383,9 +383,9 @@ Print information only about job step 65552.1: .br # squeue --steps 65552.1 .br - STEPID PARTITION USER TIME_USED NODELIST(REASON) + STEPID NAME PARTITION USER TIME_USED NODELIST(REASON) .br - 65552.1 debug alice 12:49 dev[1-4] + 65552.1 test2 debug alice 12:49 dev[1-4] .ec .SH "COPYING" diff --git a/slurm/slurm.h.in b/slurm/slurm.h.in index 14700786571..db0f0bcfcc0 100644 --- a/slurm/slurm.h.in +++ b/slurm/slurm.h.in @@ -379,6 +379,8 @@ typedef struct job_step_specs { uint16_t port; /* port to contact initiating srun */ char *host; /* host to contact initiating srun */ char *node_list; /* list of required nodes */ + char *network; /* network use spec */ + char *name; /* name of the job step, default "" */ } job_step_create_request_msg_t; typedef struct job_step_create_response_msg { @@ -396,6 +398,8 @@ typedef struct { time_t start_time; /* step start time */ char *partition; /* name of assigned partition */ char *nodes; /* list of nodes allocated to job_step */ + char *name; /* name of job step */ + char *network; /* network specs for job step */ } job_step_info_t; typedef struct job_step_info_response_msg { diff --git a/src/api/job_step_info.c b/src/api/job_step_info.c index b1905bf7559..04ab48a3231 100644 --- a/src/api/job_step_info.c +++ b/src/api/job_step_info.c @@ -91,8 +91,9 @@ slurm_print_job_step_info ( FILE* out, job_step_info_t * job_step_ptr, fprintf ( out, "\n "); /****** Line 2 ******/ - fprintf ( out, "Partition=%s Nodes=%s\n\n", - job_step_ptr->partition, job_step_ptr->nodes); + fprintf ( out, "Partition=%s Nodes=%s Name=%s Network=%s\n\n", + job_step_ptr->partition, job_step_ptr->nodes, + job_step_ptr->name, job_step_ptr->network); } /* diff --git a/src/common/slurm_protocol_defs.c b/src/common/slurm_protocol_defs.c index f4de323ad19..212c79f3eb3 100644 --- a/src/common/slurm_protocol_defs.c +++ b/src/common/slurm_protocol_defs.c @@ -254,6 +254,8 @@ void slurm_free_job_step_create_request_msg(job_step_create_request_msg_t * msg) { if (msg) { + xfree(msg->name); + xfree(msg->network); xfree(msg->node_list); xfree(msg->host); xfree(msg); diff --git a/src/common/slurm_protocol_pack.c b/src/common/slurm_protocol_pack.c index f4a0fd0bf6d..163a2630216 100644 --- a/src/common/slurm_protocol_pack.c +++ b/src/common/slurm_protocol_pack.c @@ -1296,6 +1296,8 @@ _pack_job_step_create_request_msg(job_step_create_request_msg_t pack16(msg->task_dist, buffer); pack16(msg->port, buffer); packstr(msg->host, buffer); + packstr(msg->name, buffer); + packstr(msg->network, buffer); packstr(msg->node_list, buffer); } @@ -1321,12 +1323,16 @@ _unpack_job_step_create_request_msg(job_step_create_request_msg_t ** msg, safe_unpack16(&(tmp_ptr->task_dist), buffer); safe_unpack16(&(tmp_ptr->port), buffer); safe_unpackstr_xmalloc(&(tmp_ptr->host), &uint16_tmp, buffer); + safe_unpackstr_xmalloc(&(tmp_ptr->name), &uint16_tmp, buffer); + safe_unpackstr_xmalloc(&(tmp_ptr->network), &uint16_tmp, buffer); safe_unpackstr_xmalloc(&(tmp_ptr->node_list), &uint16_tmp, buffer); return SLURM_SUCCESS; unpack_error: xfree(tmp_ptr->host); + xfree(tmp_ptr->name); + xfree(tmp_ptr->network); xfree(tmp_ptr->node_list); xfree(tmp_ptr); *msg = NULL; @@ -1572,7 +1578,8 @@ void pack_job_step_info_members(uint32_t job_id, uint16_t step_id, uint32_t user_id, uint32_t num_tasks, time_t start_time, char *partition, - char *nodes, Buf buffer) + char *nodes, char *name, char *network, + Buf buffer) { pack32(job_id, buffer); pack16(step_id, buffer); @@ -1582,7 +1589,8 @@ pack_job_step_info_members(uint32_t job_id, uint16_t step_id, pack_time(start_time, buffer); packstr(partition, buffer); packstr(nodes, buffer); - + packstr(name, buffer); + packstr(network, buffer); } /* pack_job_step_info @@ -1599,7 +1607,8 @@ pack_job_step_info(job_step_info_t * step, Buf buffer) step->user_id, step->num_tasks, step->start_time, - step->partition, step->nodes, buffer); + step->partition, step->nodes, + step->name, step->network, buffer); } /* _unpack_job_step_info_members @@ -1621,12 +1630,16 @@ _unpack_job_step_info_members(job_step_info_t * step, Buf buffer) safe_unpack_time(&step->start_time, buffer); safe_unpackstr_xmalloc(&step->partition, &uint16_tmp, buffer); safe_unpackstr_xmalloc(&step->nodes, &uint16_tmp, buffer); + safe_unpackstr_xmalloc(&step->name, &uint16_tmp, buffer); + safe_unpackstr_xmalloc(&step->network, &uint16_tmp, buffer); return SLURM_SUCCESS; unpack_error: xfree(step->partition); xfree(step->nodes); + xfree(step->name); + xfree(step->network); return SLURM_ERROR; } diff --git a/src/common/slurm_protocol_pack.h b/src/common/slurm_protocol_pack.h index 3d5cfe58cb0..363abfd302e 100644 --- a/src/common/slurm_protocol_pack.h +++ b/src/common/slurm_protocol_pack.h @@ -124,6 +124,7 @@ void pack_job_step_info ( job_step_info_t* step, Buf buffer ); */ void pack_job_step_info_members( uint32_t job_id, uint16_t step_id, uint32_t user_id, uint32_t num_tasks, time_t start_time, - char *partition, char *nodes, Buf buffer ); + char *partition, char *nodes, char *name, char *network, + Buf buffer ); #endif diff --git a/src/slurmctld/job_mgr.c b/src/slurmctld/job_mgr.c index 2e4f818c5de..aca736508e2 100644 --- a/src/slurmctld/job_mgr.c +++ b/src/slurmctld/job_mgr.c @@ -72,7 +72,7 @@ #define JOB_HASH_INX(_job_id) (_job_id % hash_table_size) -#define JOB_STATE_VERSION "VER001" +#define JOB_STATE_VERSION "VER002" /* Global variables */ List job_list = NULL; /* job_record list */ @@ -816,6 +816,8 @@ static void _dump_job_step_state(struct step_record *step_ptr, Buf buffer) pack_time(step_ptr->start_time, buffer); packstr(step_ptr->host, buffer); packstr(step_ptr->step_node_list, buffer); + packstr(step_ptr->name, buffer); + packstr(step_ptr->network, buffer); pack16(step_ptr->batch_step, buffer); if (!step_ptr->batch_step) switch_pack_jobinfo(step_ptr->switch_job, buffer); @@ -830,6 +832,7 @@ static int _load_step_state(struct job_record *job_ptr, Buf buffer) uint32_t num_tasks; time_t start_time; char *step_node_list = NULL, *host = NULL; + char *name = NULL, *network = NULL; switch_jobinfo_t switch_tmp = NULL; check_jobinfo_t check_tmp = NULL; @@ -840,6 +843,8 @@ static int _load_step_state(struct job_record *job_ptr, Buf buffer) safe_unpack_time(&start_time, buffer); safe_unpackstr_xmalloc(&host, &name_len, buffer); safe_unpackstr_xmalloc(&step_node_list, &name_len, buffer); + safe_unpackstr_xmalloc(&name, &name_len, buffer); + safe_unpackstr_xmalloc(&network, &name_len, buffer); safe_unpack16(&batch_step, buffer); if (!batch_step) { switch_alloc_jobinfo(&switch_tmp); @@ -869,6 +874,8 @@ static int _load_step_state(struct job_record *job_ptr, Buf buffer) /* set new values */ step_ptr->step_id = step_id; step_ptr->cyclic_alloc = cyclic_alloc; + step_ptr->name = name; + step_ptr->network = network; step_ptr->num_tasks = num_tasks; step_ptr->port = port; step_ptr->host = host; @@ -885,6 +892,8 @@ static int _load_step_state(struct job_record *job_ptr, Buf buffer) unpack_error: xfree(host); + xfree(name); + xfree(network); xfree(step_node_list); if (switch_tmp) switch_free_jobinfo(switch_tmp); return SLURM_FAILURE; diff --git a/src/slurmctld/proc_req.c b/src/slurmctld/proc_req.c index 8598dbf6b37..067911402b7 100644 --- a/src/slurmctld/proc_req.c +++ b/src/slurmctld/proc_req.c @@ -558,6 +558,8 @@ static void _slurm_rpc_allocate_and_run(slurm_msg_t * msg) req_step_msg.node_count = INFINITE; req_step_msg.cpu_count = job_desc_msg->num_procs; #endif + req_step_msg.name = job_ptr->name; + req_step_msg.network = job_ptr->network; req_step_msg.num_tasks = job_desc_msg->num_tasks; req_step_msg.task_dist = job_desc_msg->task_dist; error_code = step_create(&req_step_msg, &step_rec, true, false); @@ -2037,8 +2039,9 @@ int _launch_batch_step(job_desc_msg_t *job_desc_msg, uid_t uid, req_step_msg.task_dist = SLURM_DIST_CYCLIC; req_step_msg.port = 0; req_step_msg.host = NULL; + req_step_msg.name = NULL; + req_step_msg.network = NULL; req_step_msg.node_list = NULL; - START_TIMER; lock_slurmctld(job_write_lock); diff --git a/src/slurmctld/slurmctld.h b/src/slurmctld/slurmctld.h index 97ed8ad7225..14307de760a 100644 --- a/src/slurmctld/slurmctld.h +++ b/src/slurmctld/slurmctld.h @@ -349,9 +349,10 @@ struct step_record { uint16_t batch_step; /* 1 if batch job step, 0 otherwise */ switch_jobinfo_t switch_job; /* switch context, opaque */ check_jobinfo_t check_job; /* checkpoint context, opaque */ + char *name; /* name of job step */ + char *network; /* step's network specification */ }; -typedef struct job_step_specs step_specs; extern List job_list; /* list of job_record entries */ extern List job_list; /* list of job_record entries */ @@ -506,7 +507,7 @@ extern void dump_job_desc(job_desc_msg_t * job_specs); * dump_step_desc - dump the incoming step initiate request message * IN step_spec - job step request specification from RPC */ -extern void dump_step_desc(step_specs *step_spec); +extern void dump_step_desc(job_step_create_request_msg_t *step_spec); /* * find_job_record - return a pointer to the job record with the given job_id @@ -1107,7 +1108,7 @@ extern int slurmctld_shutdown(void); * NOTE: don't free the returned step_record because that is managed through * the job. */ -extern int step_create ( step_specs *step_specs, +extern int step_create ( job_step_create_request_msg_t *step_specs, struct step_record** new_step_record, bool kill_job_when_step_done, bool batch_step ); diff --git a/src/slurmctld/step_mgr.c b/src/slurmctld/step_mgr.c index ab3558c6f26..701a8e4aa23 100644 --- a/src/slurmctld/step_mgr.c +++ b/src/slurmctld/step_mgr.c @@ -56,7 +56,7 @@ static int _job_step_ckpt_error(struct step_record *step_ptr, slurm_fd conn_fd); static void _pack_ctld_job_step_info(struct step_record *step, Buf buffer); static bitstr_t * _pick_step_nodes (struct job_record *job_ptr, - step_specs *step_spec ); + job_step_create_request_msg_t *step_spec ); /* * create_step_record - create an empty step_record for the specified job. * IN job_ptr - pointer to job table entry to have step record added @@ -166,7 +166,7 @@ delete_step_record (struct job_record *job_ptr, uint32_t step_id) * IN step_spec - job step request specification from RPC */ void -dump_step_desc(step_specs *step_spec) +dump_step_desc(job_step_create_request_msg_t *step_spec) { if (step_spec == NULL) return; @@ -177,8 +177,9 @@ dump_step_desc(step_specs *step_spec) debug3(" num_tasks=%u relative=%u task_dist=%u node_list=%s", step_spec->num_tasks, step_spec->relative, step_spec->task_dist, step_spec->node_list); - debug3(" host=%s port=%u", - step_spec->host, step_spec->port); + debug3(" host=%s port=%u name=%s network=%s", + step_spec->host, step_spec->port, step_spec->name, + step_spec->network); } @@ -366,7 +367,8 @@ int job_step_complete(uint32_t job_id, uint32_t step_id, uid_t uid, * NOTE: returned bitmap must be freed by the caller using bit_free() */ static bitstr_t * -_pick_step_nodes (struct job_record *job_ptr, step_specs *step_spec ) +_pick_step_nodes (struct job_record *job_ptr, + job_step_create_request_msg_t *step_spec ) { bitstr_t *nodes_avail = NULL, *nodes_picked = NULL, *node_tmp = NULL; @@ -488,9 +490,10 @@ cleanup: * NOTE: don't free the returned step_record because that is managed through * the job. */ -int -step_create ( step_specs *step_specs, struct step_record** new_step_record, - bool kill_job_when_step_done, bool batch_step ) +extern int +step_create ( job_step_create_request_msg_t *step_specs, + struct step_record** new_step_record, + bool kill_job_when_step_done, bool batch_step ) { struct step_record *step_ptr; struct job_record *job_ptr; @@ -554,6 +557,17 @@ step_create ( step_specs *step_specs, struct step_record** new_step_record, step_ptr->host = xstrdup(step_specs->host); step_ptr->batch_step = batch_step; + /* step's name and network default to job's values if not + * specified in the step specification */ + if (step_specs->name && step_specs->name[0]) + step_ptr->name = xstrdup(step_specs->name); + else + step_ptr->name = xstrdup(job_ptr->name); + if (step_specs->network && step_specs->network[0]) + step_ptr->network = xstrdup(step_specs->network); + else + step_ptr->network = xstrdup(job_ptr->network); + /* a batch script does not need switch info */ if (!batch_step) { int *tasks_per_node; @@ -571,7 +585,7 @@ step_create ( step_specs *step_specs, struct step_record** new_step_record, step_ptr->step_node_list, tasks_per_node, step_ptr->cyclic_alloc, - job_ptr->network) < 0) { + step_ptr->network) < 0) { error("switch_build_jobinfo: %m"); xfree(tasks_per_node); delete_step_record (job_ptr, step_ptr->step_id); @@ -598,7 +612,8 @@ static void _pack_ctld_job_step_info(struct step_record *step, Buf buffer) step->num_tasks, step->start_time, step->job_ptr->partition, - step->step_node_list, buffer); + step->step_node_list, + step->name, step->network, buffer); } /* diff --git a/src/squeue/opts.c b/src/squeue/opts.c index d5c822f3b21..03c6a5df19a 100644 --- a/src/squeue/opts.c +++ b/src/squeue/opts.c @@ -335,6 +335,12 @@ extern int parse_format( char* format ) step_format_add_id( params.format_list, field_size, right_justify, suffix ); + else if (field[0] == 'j') + step_format_add_name( params.format_list, + field_size, + right_justify, + suffix ); + else if (field[0] == 'M') step_format_add_time_used( params.format_list, field_size, diff --git a/src/squeue/print.c b/src/squeue/print.c index 246bca7ce0b..23c9ee985fb 100644 --- a/src/squeue/print.c +++ b/src/squeue/print.c @@ -950,6 +950,18 @@ int _print_step_time_used(job_step_info_t * step, int width, bool right, return SLURM_SUCCESS; } +int _print_step_name(job_step_info_t * step, int width, bool right, + char* suffix) +{ + if (step == NULL) /* Print the Header instead */ + _print_str("NAME", width, right, true); + else + _print_nodes(step->name, width, right, true); + if (suffix) + printf("%s", suffix); + return SLURM_SUCCESS; +} + int _print_step_nodes(job_step_info_t * step, int width, bool right, char* suffix) { diff --git a/src/squeue/print.h b/src/squeue/print.h index 7c879fddea3..0d34ee54db9 100644 --- a/src/squeue/print.h +++ b/src/squeue/print.h @@ -236,6 +236,8 @@ int step_format_add_function(List list, int width, bool right_justify, step_format_add_function(list,wid,right,suffix,_print_step_time_used) #define step_format_add_nodes(list,wid,right,suffix) \ step_format_add_function(list,wid,right,suffix,_print_step_nodes) +#define step_format_add_name(list,wid,right,suffix) \ + step_format_add_function(list,wid,right,suffix,_print_step_name) /***************************************************************************** * Step Line Print Functions @@ -254,6 +256,8 @@ int _print_step_time_start(job_step_info_t * step, int width, bool right_justify, char *suffix); int _print_step_time_used(job_step_info_t * step, int width, bool right_justify, char *suffix); +int _print_step_name(job_step_info_t * step, int width, + bool right_justify, char *suffix); int _print_step_nodes(job_step_info_t * step, int width, bool right_justify, char *suffix); diff --git a/src/squeue/squeue.c b/src/squeue/squeue.c index 03b44545d2f..f7d05e94eac 100644 --- a/src/squeue/squeue.c +++ b/src/squeue/squeue.c @@ -197,7 +197,7 @@ _print_job_steps( void ) (long) new_step_ptr->last_update); if (params.format == NULL) - params.format = "%10i %.9P %.8u %.9M %N"; + params.format = "%10i %.8j %.9P %.8u %.9M %N"; if (params.format_list == NULL) parse_format(params.format); diff --git a/src/srun/allocate.c b/src/srun/allocate.c index 22e4a9131a9..6579b34a4b8 100644 --- a/src/srun/allocate.c +++ b/src/srun/allocate.c @@ -465,6 +465,8 @@ _step_req_create(srun_job_t *j) : (opt.nprocs*opt.cpus_per_task); r->num_tasks = opt.nprocs; r->node_list = j->nodelist; + r->network = opt.network; + r->name = opt.job_name; r->relative = false; /* XXX fix this oneday */ switch (opt.distribution) { -- GitLab