From c45b5c4689a05a862405d3dafca60221d0bb723a Mon Sep 17 00:00:00 2001
From: Moe Jette <jette1@llnl.gov>
Date: Thu, 25 Aug 2005 19:23:53 +0000
Subject: [PATCH] Add name and network parameters to job step information,
 reported by squeue and scontrol, data saved/restored by slurmctld, etc.

---
 NEWS                             |  2 ++
 doc/man/man1/squeue.1            | 18 ++++++++--------
 slurm/slurm.h.in                 |  4 ++++
 src/api/job_step_info.c          |  5 +++--
 src/common/slurm_protocol_defs.c |  2 ++
 src/common/slurm_protocol_pack.c | 19 ++++++++++++++---
 src/common/slurm_protocol_pack.h |  3 ++-
 src/slurmctld/job_mgr.c          | 11 +++++++++-
 src/slurmctld/proc_req.c         |  5 ++++-
 src/slurmctld/slurmctld.h        |  7 ++++---
 src/slurmctld/step_mgr.c         | 35 +++++++++++++++++++++++---------
 src/squeue/opts.c                |  6 ++++++
 src/squeue/print.c               | 12 +++++++++++
 src/squeue/print.h               |  4 ++++
 src/squeue/squeue.c              |  2 +-
 src/srun/allocate.c              |  2 ++
 16 files changed, 106 insertions(+), 31 deletions(-)

diff --git a/NEWS b/NEWS
index e34b78336ef..489f0badb4d 100644
--- a/NEWS
+++ b/NEWS
@@ -5,6 +5,8 @@ documents those changes that are of interest to users and admins.
 =============================
  -- Add code so job request for shared nodes gets explicitly requested 
     nodes, but lightly loaded nodes otherwise.
+ -- Add job step name field.
+ -- Add job step network specification field.
  -- Add proctrack/rms plugin
  -- Change the proctrack API to send a slurmd_job_t pointer to both
     slurm_container_create() and slurm_container_add().  One of those
diff --git a/doc/man/man1/squeue.1 b/doc/man/man1/squeue.1
index 34195c5ada2..0afbf02c22c 100644
--- a/doc/man/man1/squeue.1
+++ b/doc/man/man1/squeue.1
@@ -1,4 +1,4 @@
-.TH SQUEUE "1" "July 2005" "squeue 0.5" "Slurm components"
+.TH SQUEUE "1" "August 2005" "squeue 0.6" "Slurm components"
 
 .SH "NAME"
 squeue \- view information about jobs located in the SLURM scheduling queue.
@@ -62,7 +62,7 @@ various options are
 ".7i %.9P %.8j %.8u %.8T %.9M %.9l %.6D %R"
 .TP
 .I "-s, --steps"
-"%10i %.9P %.8u %.9M %N"
+"%10i %.8j %.9P %.8u %.9M %N"
 .RE
 
 .IP
@@ -118,7 +118,7 @@ Can the nodes allocated to the job be shared with other jobs
 Job or job step id
 .TP
 \fB%j\fR
-Job name
+Job or job step name
 .TP
 \fB%l\fR
 Time limit of the job in days:hours:minutes:seconds. 
@@ -355,13 +355,13 @@ Print the job steps in the debug partition sorted by user:
 .br
 # squeue -s -p debug -S u
 .br
-  STEPID    PARTITION     USER TIME_USED NODELIST(REASON)
+  STEPID        NAME PARTITION     USER TIME_USED NODELIST(REASON)
 .br
- 65552.1        debug    alice      0:23 dev[1-4]
+ 65552.1       test1     debug    alice      0:23 dev[1-4]
 .br
- 65562.2        debug      bob      0:18 dev22
+ 65562.2     big_run     debug      bob      0:18 dev22
 .br
- 65550.1        debug  candice   1:43:21 dev[6-12]
+ 65550.1      param1     debug  candice   1:43:21 dev[6-12]
 .ec
 
 .eo
@@ -383,9 +383,9 @@ Print information only about job step 65552.1:
 .br
 # squeue --steps 65552.1
 .br
-  STEPID    PARTITION    USER    TIME_USED NODELIST(REASON)
+  STEPID     NAME PARTITION    USER    TIME_USED NODELIST(REASON)
 .br
- 65552.1        debug   alice        12:49 dev[1-4]
+ 65552.1    test1     debug   alice        12:49 dev[1-4]
 .ec
 
 .SH "COPYING"
diff --git a/slurm/slurm.h.in b/slurm/slurm.h.in
index 14700786571..db0f0bcfcc0 100644
--- a/slurm/slurm.h.in
+++ b/slurm/slurm.h.in
@@ -379,6 +379,8 @@ typedef struct job_step_specs {
 	uint16_t port;		/* port to contact initiating srun */
 	char *host;		/* host to contact initiating srun */
 	char *node_list;	/* list of required nodes */
+	char *network;		/* network use spec */
+	char *name;		/* name of the job step, default "" */
 } job_step_create_request_msg_t;
 
 typedef struct job_step_create_response_msg {
@@ -396,6 +398,8 @@ typedef struct {
 	time_t start_time;	/* step start time */
 	char *partition;	/* name of assigned partition */
 	char *nodes;		/* list of nodes allocated to job_step */
+	char *name;		/* name of job step */
+	char *network;		/* network specs for job step */
 } job_step_info_t;
 
 typedef struct job_step_info_response_msg {
diff --git a/src/api/job_step_info.c b/src/api/job_step_info.c
index b1905bf7559..04ab48a3231 100644
--- a/src/api/job_step_info.c
+++ b/src/api/job_step_info.c
@@ -91,8 +91,9 @@ slurm_print_job_step_info ( FILE* out, job_step_info_t * job_step_ptr,
 		fprintf ( out, "\n   ");
 
 	/****** Line 2 ******/
-	fprintf ( out, "Partition=%s Nodes=%s\n\n", 
-		job_step_ptr->partition, job_step_ptr->nodes);
+	fprintf ( out, "Partition=%s Nodes=%s Name=%s Network=%s\n\n", 
+		job_step_ptr->partition, job_step_ptr->nodes,
+		job_step_ptr->name, job_step_ptr->network);
 }
 
 /*
diff --git a/src/common/slurm_protocol_defs.c b/src/common/slurm_protocol_defs.c
index f4de323ad19..212c79f3eb3 100644
--- a/src/common/slurm_protocol_defs.c
+++ b/src/common/slurm_protocol_defs.c
@@ -254,6 +254,8 @@ void slurm_free_job_step_create_request_msg(job_step_create_request_msg_t *
 					    msg)
 {
 	if (msg) {
+		xfree(msg->name);
+		xfree(msg->network);
 		xfree(msg->node_list);
 		xfree(msg->host);
 		xfree(msg);
diff --git a/src/common/slurm_protocol_pack.c b/src/common/slurm_protocol_pack.c
index f4a0fd0bf6d..163a2630216 100644
--- a/src/common/slurm_protocol_pack.c
+++ b/src/common/slurm_protocol_pack.c
@@ -1296,6 +1296,8 @@ _pack_job_step_create_request_msg(job_step_create_request_msg_t
 	pack16(msg->task_dist, buffer);
 	pack16(msg->port, buffer);
 	packstr(msg->host, buffer);
+	packstr(msg->name, buffer);
+	packstr(msg->network, buffer);
 	packstr(msg->node_list, buffer);
 }
 
@@ -1321,12 +1323,16 @@ _unpack_job_step_create_request_msg(job_step_create_request_msg_t ** msg,
 	safe_unpack16(&(tmp_ptr->task_dist), buffer);
 	safe_unpack16(&(tmp_ptr->port), buffer);
 	safe_unpackstr_xmalloc(&(tmp_ptr->host), &uint16_tmp, buffer);
+	safe_unpackstr_xmalloc(&(tmp_ptr->name), &uint16_tmp, buffer);
+	safe_unpackstr_xmalloc(&(tmp_ptr->network), &uint16_tmp, buffer);
 	safe_unpackstr_xmalloc(&(tmp_ptr->node_list), &uint16_tmp, buffer);
 
 	return SLURM_SUCCESS;
 
       unpack_error:
 	xfree(tmp_ptr->host);
+	xfree(tmp_ptr->name);
+	xfree(tmp_ptr->network);
 	xfree(tmp_ptr->node_list);
 	xfree(tmp_ptr);
 	*msg = NULL;
@@ -1572,7 +1578,8 @@ void
 pack_job_step_info_members(uint32_t job_id, uint16_t step_id,
 			   uint32_t user_id, uint32_t num_tasks,
 			   time_t start_time, char *partition, 
-			   char *nodes, Buf buffer)
+			   char *nodes, char *name, char *network,
+			   Buf buffer)
 {
 	pack32(job_id, buffer);
 	pack16(step_id, buffer);
@@ -1582,7 +1589,8 @@ pack_job_step_info_members(uint32_t job_id, uint16_t step_id,
 	pack_time(start_time, buffer);
 	packstr(partition, buffer);
 	packstr(nodes, buffer);
-
+	packstr(name, buffer);
+	packstr(network, buffer);
 }
 
 /* pack_job_step_info
@@ -1599,7 +1607,8 @@ pack_job_step_info(job_step_info_t * step, Buf buffer)
 				   step->user_id,
 				   step->num_tasks,
 				   step->start_time,
-				   step->partition, step->nodes, buffer);
+				   step->partition, step->nodes, 
+				   step->name, step->network, buffer);
 }
 
 /* _unpack_job_step_info_members
@@ -1621,12 +1630,16 @@ _unpack_job_step_info_members(job_step_info_t * step, Buf buffer)
 	safe_unpack_time(&step->start_time, buffer);
 	safe_unpackstr_xmalloc(&step->partition, &uint16_tmp, buffer);
 	safe_unpackstr_xmalloc(&step->nodes, &uint16_tmp, buffer);
+	safe_unpackstr_xmalloc(&step->name, &uint16_tmp, buffer);
+	safe_unpackstr_xmalloc(&step->network, &uint16_tmp, buffer);
 
 	return SLURM_SUCCESS;
 
       unpack_error:
 	xfree(step->partition);
 	xfree(step->nodes);
+	xfree(step->name);
+	xfree(step->network);
 	return SLURM_ERROR;
 }
 
diff --git a/src/common/slurm_protocol_pack.h b/src/common/slurm_protocol_pack.h
index 3d5cfe58cb0..363abfd302e 100644
--- a/src/common/slurm_protocol_pack.h
+++ b/src/common/slurm_protocol_pack.h
@@ -124,6 +124,7 @@ void pack_job_step_info ( job_step_info_t* step, Buf buffer );
  */ 
 void pack_job_step_info_members( uint32_t job_id, uint16_t step_id, 
 		uint32_t user_id, uint32_t num_tasks, time_t start_time, 
-		char *partition, char *nodes, Buf buffer );
+		char *partition, char *nodes, char *name, char *network,
+		Buf buffer );
 
 #endif
diff --git a/src/slurmctld/job_mgr.c b/src/slurmctld/job_mgr.c
index 2e4f818c5de..aca736508e2 100644
--- a/src/slurmctld/job_mgr.c
+++ b/src/slurmctld/job_mgr.c
@@ -72,7 +72,7 @@
 
 #define JOB_HASH_INX(_job_id)	(_job_id % hash_table_size)
 
-#define JOB_STATE_VERSION      "VER001"
+#define JOB_STATE_VERSION      "VER002"
 
 /* Global variables */
 List   job_list = NULL;		/* job_record list */
@@ -816,6 +816,8 @@ static void _dump_job_step_state(struct step_record *step_ptr, Buf buffer)
 	pack_time(step_ptr->start_time, buffer);
 	packstr(step_ptr->host,  buffer);
 	packstr(step_ptr->step_node_list,  buffer);
+	packstr(step_ptr->name, buffer);
+	packstr(step_ptr->network, buffer);
 	pack16(step_ptr->batch_step, buffer);
 	if (!step_ptr->batch_step)
 		switch_pack_jobinfo(step_ptr->switch_job, buffer);
@@ -830,6 +832,7 @@ static int _load_step_state(struct job_record *job_ptr, Buf buffer)
 	uint32_t num_tasks;
 	time_t start_time;
 	char *step_node_list = NULL, *host = NULL;
+	char *name = NULL, *network = NULL;
 	switch_jobinfo_t switch_tmp = NULL;
 	check_jobinfo_t check_tmp = NULL;
 
@@ -840,6 +843,8 @@ static int _load_step_state(struct job_record *job_ptr, Buf buffer)
 	safe_unpack_time(&start_time, buffer);
 	safe_unpackstr_xmalloc(&host, &name_len, buffer);
 	safe_unpackstr_xmalloc(&step_node_list, &name_len, buffer);
+	safe_unpackstr_xmalloc(&name, &name_len, buffer);
+	safe_unpackstr_xmalloc(&network, &name_len, buffer);
 	safe_unpack16(&batch_step, buffer);
 	if (!batch_step) {
 		switch_alloc_jobinfo(&switch_tmp);
@@ -869,6 +874,8 @@ static int _load_step_state(struct job_record *job_ptr, Buf buffer)
 	/* set new values */
 	step_ptr->step_id      = step_id;
 	step_ptr->cyclic_alloc = cyclic_alloc;
+	step_ptr->name         = name;
+	step_ptr->network      = network;
 	step_ptr->num_tasks    = num_tasks;
 	step_ptr->port         = port;
 	step_ptr->host         = host;
@@ -885,6 +892,8 @@ static int _load_step_state(struct job_record *job_ptr, Buf buffer)
 
       unpack_error:
 	xfree(host);
+	xfree(name);
+	xfree(network);
 	xfree(step_node_list);
 	if (switch_tmp) switch_free_jobinfo(switch_tmp);
 	return SLURM_FAILURE;
diff --git a/src/slurmctld/proc_req.c b/src/slurmctld/proc_req.c
index 8598dbf6b37..067911402b7 100644
--- a/src/slurmctld/proc_req.c
+++ b/src/slurmctld/proc_req.c
@@ -558,6 +558,8 @@ static void _slurm_rpc_allocate_and_run(slurm_msg_t * msg)
 	req_step_msg.node_count = INFINITE;
 	req_step_msg.cpu_count  = job_desc_msg->num_procs;
 #endif
+	req_step_msg.name	= job_ptr->name;
+	req_step_msg.network	= job_ptr->network;
 	req_step_msg.num_tasks  = job_desc_msg->num_tasks;
 	req_step_msg.task_dist  = job_desc_msg->task_dist;
 	error_code = step_create(&req_step_msg, &step_rec, true, false);
@@ -2037,8 +2039,9 @@ int _launch_batch_step(job_desc_msg_t *job_desc_msg, uid_t uid,
 	req_step_msg.task_dist = SLURM_DIST_CYCLIC;
 	req_step_msg.port = 0;
 	req_step_msg.host = NULL;
+	req_step_msg.name = NULL;
+	req_step_msg.network = NULL;
 	req_step_msg.node_list = NULL;
-	
 
 	START_TIMER;
 	lock_slurmctld(job_write_lock);
diff --git a/src/slurmctld/slurmctld.h b/src/slurmctld/slurmctld.h
index 97ed8ad7225..14307de760a 100644
--- a/src/slurmctld/slurmctld.h
+++ b/src/slurmctld/slurmctld.h
@@ -349,9 +349,10 @@ struct 	step_record {
 	uint16_t batch_step;		/* 1 if batch job step, 0 otherwise */
 	switch_jobinfo_t switch_job;	/* switch context, opaque */
 	check_jobinfo_t check_job;	/* checkpoint context, opaque */
+	char *name;			/* name of job step */
+	char *network;			/* step's network specification */
 };
 
-typedef struct job_step_specs step_specs; 
 extern List job_list;			/* list of job_record entries */
 
  extern List job_list;                  /* list of job_record entries */
@@ -506,7 +507,7 @@ extern void dump_job_desc(job_desc_msg_t * job_specs);
  * dump_step_desc - dump the incoming step initiate request message
  * IN step_spec - job step request specification from RPC
  */
-extern void dump_step_desc(step_specs *step_spec);
+extern void dump_step_desc(job_step_create_request_msg_t *step_spec);
 
 /* 
  * find_job_record - return a pointer to the job record with the given job_id
@@ -1107,7 +1108,7 @@ extern int slurmctld_shutdown(void);
  * NOTE: don't free the returned step_record because that is managed through
  * 	the job.
  */
-extern int step_create ( step_specs *step_specs, 
+extern int step_create ( job_step_create_request_msg_t *step_specs, 
 			 struct step_record** new_step_record,
 			 bool kill_job_when_step_done,
 			 bool batch_step );
diff --git a/src/slurmctld/step_mgr.c b/src/slurmctld/step_mgr.c
index ab3558c6f26..701a8e4aa23 100644
--- a/src/slurmctld/step_mgr.c
+++ b/src/slurmctld/step_mgr.c
@@ -56,7 +56,7 @@
 static int _job_step_ckpt_error(struct step_record *step_ptr, slurm_fd conn_fd);
 static void _pack_ctld_job_step_info(struct step_record *step, Buf buffer);
 static bitstr_t * _pick_step_nodes (struct job_record  *job_ptr, 
-				    step_specs *step_spec );
+				    job_step_create_request_msg_t *step_spec );
 /* 
  * create_step_record - create an empty step_record for the specified job.
  * IN job_ptr - pointer to job table entry to have step record added
@@ -166,7 +166,7 @@ delete_step_record (struct job_record *job_ptr, uint32_t step_id)
  * IN step_spec - job step request specification from RPC
  */
 void
-dump_step_desc(step_specs *step_spec)
+dump_step_desc(job_step_create_request_msg_t *step_spec)
 {
 	if (step_spec == NULL) 
 		return;
@@ -177,8 +177,9 @@ dump_step_desc(step_specs *step_spec)
 	debug3("   num_tasks=%u relative=%u task_dist=%u node_list=%s", 
 		step_spec->num_tasks, step_spec->relative, 
 		step_spec->task_dist, step_spec->node_list);
-	debug3("   host=%s port=%u", 
-		step_spec->host, step_spec->port);
+	debug3("   host=%s port=%u name=%s network=%s", 
+		step_spec->host, step_spec->port, step_spec->name,
+		step_spec->network);
 }
 
 
@@ -366,7 +367,8 @@ int job_step_complete(uint32_t job_id, uint32_t step_id, uid_t uid,
  * NOTE: returned bitmap must be freed by the caller using bit_free()
  */
 static bitstr_t *
-_pick_step_nodes (struct job_record  *job_ptr, step_specs *step_spec )
+_pick_step_nodes (struct job_record  *job_ptr, 
+		job_step_create_request_msg_t *step_spec )
 {
 
 	bitstr_t *nodes_avail = NULL, *nodes_picked = NULL, *node_tmp = NULL;
@@ -488,9 +490,10 @@ cleanup:
  * NOTE: don't free the returned step_record because that is managed through
  * 	the job.
  */
-int
-step_create ( step_specs *step_specs, struct step_record** new_step_record,
-	      bool kill_job_when_step_done, bool batch_step )
+extern int
+step_create ( job_step_create_request_msg_t *step_specs, 
+		struct step_record** new_step_record,
+		bool kill_job_when_step_done, bool batch_step )
 {
 	struct step_record *step_ptr;
 	struct job_record  *job_ptr;
@@ -554,6 +557,17 @@ step_create ( step_specs *step_specs, struct step_record** new_step_record,
 	step_ptr->host = xstrdup(step_specs->host);
 	step_ptr->batch_step = batch_step;
 
+	/* step's name and network default to job's values if not 
+	 * specified in the step specification */
+	if (step_specs->name && step_specs->name[0])
+		step_ptr->name = xstrdup(step_specs->name);
+	else
+		step_ptr->name = xstrdup(job_ptr->name);
+	if (step_specs->network && step_specs->network[0])
+		step_ptr->network = xstrdup(step_specs->network);
+	else
+		step_ptr->network = xstrdup(job_ptr->network);
+
 	/* a batch script does not need switch info */
 	if (!batch_step) {
 		int *tasks_per_node;
@@ -571,7 +585,7 @@ step_create ( step_specs *step_specs, struct step_record** new_step_record,
 					step_ptr->step_node_list,
 					tasks_per_node, 
 					step_ptr->cyclic_alloc,
-					job_ptr->network) < 0) {
+					step_ptr->network) < 0) {
 			error("switch_build_jobinfo: %m");
 			xfree(tasks_per_node);
 			delete_step_record (job_ptr, step_ptr->step_id);
@@ -598,7 +612,8 @@ static void _pack_ctld_job_step_info(struct step_record *step, Buf buffer)
 				   step->num_tasks,
 				   step->start_time,
 				   step->job_ptr->partition,
-				   step->step_node_list, buffer);
+				   step->step_node_list, 
+				   step->name, step->network, buffer);
 }
 
 /* 
diff --git a/src/squeue/opts.c b/src/squeue/opts.c
index d5c822f3b21..03c6a5df19a 100644
--- a/src/squeue/opts.c
+++ b/src/squeue/opts.c
@@ -335,6 +335,12 @@ extern int parse_format( char* format )
 				step_format_add_id( params.format_list, 
 				                    field_size, 
 						    right_justify, suffix );
+			else if (field[0] == 'j')
+				step_format_add_name( params.format_list,
+							field_size,
+							right_justify,
+							suffix );
+
 			else if (field[0] == 'M')
 				step_format_add_time_used( params.format_list, 
 				                            field_size, 
diff --git a/src/squeue/print.c b/src/squeue/print.c
index 246bca7ce0b..23c9ee985fb 100644
--- a/src/squeue/print.c
+++ b/src/squeue/print.c
@@ -950,6 +950,18 @@ int _print_step_time_used(job_step_info_t * step, int width, bool right,
 	return SLURM_SUCCESS;
 }
 
+int _print_step_name(job_step_info_t * step, int width, bool right,
+			char* suffix)
+{
+	if (step == NULL)	/* Print the Header instead */
+		_print_str("NAME", width, right, true);
+	else			/* name is free text, not a node list */
+		_print_str(step->name, width, right, true);
+	if (suffix)
+		printf("%s", suffix);
+	return SLURM_SUCCESS;
+}
+
 int _print_step_nodes(job_step_info_t * step, int width, bool right, 
 		      char* suffix)
 {
diff --git a/src/squeue/print.h b/src/squeue/print.h
index 7c879fddea3..0d34ee54db9 100644
--- a/src/squeue/print.h
+++ b/src/squeue/print.h
@@ -236,6 +236,8 @@ int step_format_add_function(List list, int width, bool right_justify,
 	step_format_add_function(list,wid,right,suffix,_print_step_time_used)
 #define step_format_add_nodes(list,wid,right,suffix) \
 	step_format_add_function(list,wid,right,suffix,_print_step_nodes)
+#define step_format_add_name(list,wid,right,suffix) \
+	step_format_add_function(list,wid,right,suffix,_print_step_name)
 
 /*****************************************************************************
  * Step Line Print Functions
@@ -254,6 +256,8 @@ int _print_step_time_start(job_step_info_t * step, int width,
 			bool right_justify, char *suffix);
 int _print_step_time_used(job_step_info_t * step, int width,
 			bool right_justify, char *suffix);
+int _print_step_name(job_step_info_t * step, int width,
+			bool right_justify, char *suffix);
 int _print_step_nodes(job_step_info_t * step, int width,
 			bool right_justify, char *suffix);
 
diff --git a/src/squeue/squeue.c b/src/squeue/squeue.c
index 03b44545d2f..f7d05e94eac 100644
--- a/src/squeue/squeue.c
+++ b/src/squeue/squeue.c
@@ -197,7 +197,7 @@ _print_job_steps( void )
 		        (long) new_step_ptr->last_update);
 	
 	if (params.format == NULL)
-		params.format = "%10i %.9P %.8u %.9M %N";
+		params.format = "%10i %.8j %.9P %.8u %.9M %N";
 	if (params.format_list == NULL)
 		parse_format(params.format);
 
diff --git a/src/srun/allocate.c b/src/srun/allocate.c
index 22e4a9131a9..6579b34a4b8 100644
--- a/src/srun/allocate.c
+++ b/src/srun/allocate.c
@@ -465,6 +465,8 @@ _step_req_create(srun_job_t *j)
 		                       : (opt.nprocs*opt.cpus_per_task);
 	r->num_tasks  = opt.nprocs;
 	r->node_list  = j->nodelist;
+	r->network    = opt.network;
+	r->name       = opt.job_name;
 	r->relative   = false;      /* XXX fix this oneday */
 
 	switch (opt.distribution) {
-- 
GitLab