From b0460a3dfdd20a20615717298b649db7db591603 Mon Sep 17 00:00:00 2001
From: Danny Auble <da@schedmd.com>
Date: Thu, 27 Feb 2014 10:02:35 -0800
Subject: [PATCH] NRT - Fix to supply correct error messages to poe/pmd when a
 launch fails.

---
 NEWS                                       |  1 +
 src/plugins/switch/nrt/libpermapi/shr_64.c | 99 +++++++++++++---------
 2 files changed, 61 insertions(+), 39 deletions(-)

diff --git a/NEWS b/NEWS
index 6e6e71aa321..7dfaf238be4 100644
--- a/NEWS
+++ b/NEWS
@@ -27,6 +27,7 @@ documents those changes that are of interest to users and admins.
  -- Fixed minor memory leak in backfill scheduler.
  -- MYSQL - Fixed memory leak when querying clusters.
  -- MYSQL - Fix when updating QOS on an association.
+ -- NRT - Fix to supply correct error messages to poe/pmd when a launch fails.
 
 * Changes in Slurm 2.6.6
 ========================
diff --git a/src/plugins/switch/nrt/libpermapi/shr_64.c b/src/plugins/switch/nrt/libpermapi/shr_64.c
index 4d03b7f65ab..099fbd20cac 100644
--- a/src/plugins/switch/nrt/libpermapi/shr_64.c
+++ b/src/plugins/switch/nrt/libpermapi/shr_64.c
@@ -89,6 +89,7 @@ static bool slurm_started = false;
 static log_options_t log_opts = LOG_OPTS_STDERR_ONLY;
 static host_usage_t *host_usage = NULL;
 static hostlist_t total_hl = NULL;
+static int err_msg_len = 400;
 
 int sig_array[] = {
 	SIGINT,  SIGQUIT, SIGCONT, SIGTERM, SIGHUP,
@@ -467,8 +468,8 @@ static char *_uint16_array_to_str(int array_len, const uint16_t *array)
 	for (i = 0; i < array_len; i++) {
 		if ((i+1 < array_len)
 		    && (array[i] == array[i+1])) {
-				previous++;
-				continue;
+			previous++;
+			continue;
 		}
 
 		if (i == array_len-1) /* last time through loop */
@@ -932,7 +933,9 @@ extern int pe_rm_connect(rmhandle_t resource_mgr,
 		debug("got pe_rm_connect called");
 		launch_common_set_stdio_fds(job, &cio_fds);
 	} else {
-		*error_msg = xstrdup_printf("pe_rm_connect: unknown caller");
+		*error_msg = malloc(sizeof(char) * err_msg_len);
+		snprintf(*error_msg, err_msg_len,
+			 "pe_rm_connect: unknown caller");
 		error("%s", *error_msg);
 		return -1;
 	}
@@ -945,9 +948,10 @@ extern int pe_rm_connect(rmhandle_t resource_mgr,
 		if (!name) {
 			if (hl)
 				hostlist_destroy(hl);
-			*error_msg = xstrdup_printf(
-				"pe_rm_connect: unknown host for ip %s",
-				connect_param->machine_name[i]);
+			*error_msg = malloc(sizeof(char) * err_msg_len);
+			snprintf(*error_msg, err_msg_len,
+				 "pe_rm_connect: unknown host for ip %s",
+				 connect_param->machine_name[i]);
 			error("%s", *error_msg);
 			return -1;
 		}
@@ -964,10 +968,11 @@ extern int pe_rm_connect(rmhandle_t resource_mgr,
 	}
 
 	if (!hl) {
-		*error_msg = xstrdup_printf(
-			"pe_rm_connect: machine_count 0? it came in as "
-			"%d but we didn't get a hostlist",
-			connect_param->machine_count);
+		*error_msg = malloc(sizeof(char) * err_msg_len);
+		snprintf(*error_msg, err_msg_len,
+			 "pe_rm_connect: machine_count 0? it came in as "
+			 "%d but we didn't get a hostlist",
+			 connect_param->machine_count);
 		error("%s", *error_msg);
 		return -1;
 	}
@@ -993,9 +998,10 @@ extern int pe_rm_connect(rmhandle_t resource_mgr,
 						node_cnt, &task_num)
 	    != SLURM_SUCCESS) {
 		xfree(total_node_list);
-		*error_msg = xstrdup_printf(
-			"pe_rm_connect: problem with hack: %s",
-			slurm_strerror(errno));
+		*error_msg = malloc(sizeof(char) * err_msg_len);
+		snprintf(*error_msg, err_msg_len,
+			 "pe_rm_connect: problem with hack: %s",
+			 slurm_strerror(errno));
 		error("%s", *error_msg);
 		return -1;
 	}
@@ -1008,9 +1014,10 @@ extern int pe_rm_connect(rmhandle_t resource_mgr,
 	step_callbacks.step_timeout  = _self_timeout;
 
 	if (launch_g_step_launch(job, &cio_fds, &global_rc, &step_callbacks)) {
-		*error_msg = xstrdup_printf(
-			"pe_rm_connect: problem with launch: %s",
-			slurm_strerror(errno));
+		*error_msg = malloc(sizeof(char) * err_msg_len);
+		snprintf(*error_msg, err_msg_len,
+			 "pe_rm_connect: problem with launch: %s",
+			 slurm_strerror(errno));
 		error("%s", *error_msg);
 		return -1;
 	}
@@ -1019,16 +1026,19 @@ extern int pe_rm_connect(rmhandle_t resource_mgr,
 				SLURM_STEP_CTX_USER_MANAGED_SOCKETS,
 				&fd_cnt, &ctx_sockfds);
 	if (ctx_sockfds == NULL) {
-		*error_msg = xstrdup_printf(
-			"pe_rm_connect: Unable to get pmd IO socket array %d",
-			rc);
+		*error_msg = malloc(sizeof(char) * err_msg_len);
+		snprintf(*error_msg, err_msg_len,
+			 "pe_rm_connect: Unable to get pmd IO socket array %d",
+			 rc);
 		error("%s", *error_msg);
 		return -1;
 	}
 	if (fd_cnt != task_num) {
-		*error_msg = xstrdup_printf(
-			"pe_rm_connect: looking for %d sockets but got back %d",
-			connect_param->machine_count, fd_cnt);
+		*error_msg = malloc(sizeof(char) * err_msg_len);
+		snprintf(*error_msg, err_msg_len,
+			 "pe_rm_connect: looking for %d sockets but "
+			 "got back %d",
+			 connect_param->machine_count, fd_cnt);
 		error("%s", *error_msg);
 		return -1;
 	}
@@ -1169,7 +1179,9 @@ extern int pe_rm_get_event(rmhandle_t resource_mgr, job_event_t **job_event,
 		debug("pe_rm_get_event called");
 		return 0;
 	} else if (pm_type != PM_POE) {
-		*error_msg = xstrdup_printf("pe_rm_get_event: unknown caller");
+		*error_msg = malloc(sizeof(char) * err_msg_len);
+		snprintf(*error_msg, err_msg_len,
+			 "pe_rm_get_event: unknown caller");
 		error("%s", *error_msg);
 		return -1;
 	}
@@ -1238,8 +1250,9 @@ extern int pe_rm_get_job_info(rmhandle_t resource_mgr, job_info_t **job_info,
 		debug("pe_rm_get_job_info called");
 		return 0;
 	} else if (pm_type != PM_POE) {
-		*error_msg = xstrdup_printf(
-			"pe_rm_get_job_info: unknown caller");
+		*error_msg = malloc(sizeof(char) * err_msg_len);
+		snprintf(*error_msg, err_msg_len,
+			 "pe_rm_get_job_info: unknown caller");
 		error("%s", *error_msg);
 		return -1;
 	}
@@ -1266,8 +1279,9 @@ extern int pe_rm_get_job_info(rmhandle_t resource_mgr, job_info_t **job_info,
 
 	slurm_step_ctx_get(job->step_ctx, SLURM_STEP_CTX_RESP, &resp);
 	if (!resp) {
-		*error_msg = xstrdup_printf(
-			"pe_rm_get_job_info: no step response in step ctx");
+		*error_msg = malloc(sizeof(char) * err_msg_len);
+		snprintf(*error_msg, err_msg_len,
+			 "pe_rm_get_job_info: no step response in step ctx");
 		error("%s", *error_msg);
 		return -1;
 	}
@@ -1653,7 +1667,7 @@ extern int pe_rm_init(int *rmapi_version, rmhandle_t *resource_mgr, char *rm_id,
 				xstrcat(opt.network, "bulk_xfer");
 			}
 			xfree(bulk_xfer);
-				xfree(collectives);
+			xfree(collectives);
 			xfree(euidevice);
 			xfree(euilib);
 			xfree(immediate);
@@ -1690,17 +1704,19 @@ extern int pe_rm_init(int *rmapi_version, rmhandle_t *resource_mgr, char *rm_id,
 		if ((srun_debug = getenv("SLURM_STEP_ID")))
 			step_id = atoi(srun_debug);
 		if (job_id == -1 || step_id == -1) {
-			*error_msg = xstrdup_printf(
-				"pe_rm_init: SLURM_JOB_ID or SLURM_STEP_ID "
-				"not found %d.%d", job_id, step_id);
+			*error_msg = malloc(sizeof(char) * err_msg_len);
+			snprintf(*error_msg, err_msg_len,
+				 "pe_rm_init: SLURM_JOB_ID or SLURM_STEP_ID "
+				 "not found %d.%d", job_id, step_id);
 			error("%s", *error_msg);
 			return -1;
 		}
 
 		job = _read_job_srun_agent();
 		if (!job) {
-			*error_msg = xstrdup_printf(
-				"pe_rm_init: no job created");
+			*error_msg = malloc(sizeof(char) * err_msg_len);
+			snprintf(*error_msg, err_msg_len,
+				 "pe_rm_init: no job created");
 			error("%s", *error_msg);
 			return -1;
 		}
@@ -1715,7 +1731,9 @@ extern int pe_rm_init(int *rmapi_version, rmhandle_t *resource_mgr, char *rm_id,
 		 * PMD to fanout child processes on other nodes */
 		_spawn_fe_agent();
 	} else {
-		*error_msg = xstrdup_printf("pe_rm_init: unknown caller");
+		*error_msg = malloc(sizeof(char) * err_msg_len);
+		snprintf(*error_msg, err_msg_len,
+			 "pe_rm_init: unknown caller");
 		error("%s", *error_msg);
 		return -1;
 	}
@@ -1749,7 +1767,7 @@ extern int pe_rm_send_event(rmhandle_t resource_mgr, job_event_t *job_event,
 
 	if ((job_event->event == JOB_CKPT_COMPLETE) && job) {
 		struct ckpt_end_data *ckpt_end_ptr = (struct ckpt_end_data *)
-						     job_event->event_data;
+			job_event->event_data;
 		rc = slurm_checkpoint_complete(job->jobid, job->stepid,
 					       ckpt_end_ptr->ckpt_start_time,
 					       ckpt_end_ptr->ckpt_rc,
@@ -1800,7 +1818,9 @@ int pe_rm_submit_job(rmhandle_t resource_mgr, job_command_t job_cmd,
 		if (slurm_cmd_fname)
 			poe_cmd_fname = getenv("MP_CMDFILE");
 	} else {
-		*error_msg = xstrdup_printf("pe_rm_submit_job: unknown caller");
+		*error_msg = malloc(sizeof(char) * err_msg_len);
+		snprintf(*error_msg, err_msg_len,
+			 "pe_rm_submit_job: unknown caller");
 		error("%s", *error_msg);
 		return -1;
 	}
@@ -1808,9 +1828,10 @@ int pe_rm_submit_job(rmhandle_t resource_mgr, job_command_t job_cmd,
 	debug("got pe_rm_submit_job called %d", job_cmd.job_format);
 	if (job_cmd.job_format != 1) {
 		/* We don't handle files */
-		*error_msg = xstrdup_printf(
-			"pe_rm_submit_job: SLURM doesn't handle files "
-			"to submit_job");
+		*error_msg = malloc(sizeof(char) * err_msg_len);
+		snprintf(*error_msg, err_msg_len,
+			 "pe_rm_submit_job: SLURM doesn't handle files "
+			 "to submit_job");
 		error("%s", *error_msg);
 		return -1;
 	}
-- 
GitLab