Skip to content
Snippets Groups Projects
Commit b0460a3d authored by Danny Auble
Browse files

NRT - Fix to supply correct error messages to poe/pmd when a launch fails.

parent 324a089e
No related branches found
No related tags found
No related merge requests found
......@@ -27,6 +27,7 @@ documents those changes that are of interest to users and admins.
-- Fixed minor memory leak in backfill scheduler.
-- MYSQL - Fixed memory leak when querying clusters.
-- MYSQL - Fix when updating QOS on an association.
-- NRT - Fix to supply correct error messages to poe/pmd when a launch fails.
* Changes in Slurm 2.6.6
========================
......
......@@ -89,6 +89,7 @@ static bool slurm_started = false;
static log_options_t log_opts = LOG_OPTS_STDERR_ONLY;
static host_usage_t *host_usage = NULL;
static hostlist_t total_hl = NULL;
static int err_msg_len = 400;
int sig_array[] = {
SIGINT, SIGQUIT, SIGCONT, SIGTERM, SIGHUP,
......@@ -467,8 +468,8 @@ static char *_uint16_array_to_str(int array_len, const uint16_t *array)
for (i = 0; i < array_len; i++) {
if ((i+1 < array_len)
&& (array[i] == array[i+1])) {
previous++;
continue;
previous++;
continue;
}
if (i == array_len-1) /* last time through loop */
......@@ -932,7 +933,9 @@ extern int pe_rm_connect(rmhandle_t resource_mgr,
debug("got pe_rm_connect called");
launch_common_set_stdio_fds(job, &cio_fds);
} else {
*error_msg = xstrdup_printf("pe_rm_connect: unknown caller");
*error_msg = malloc(sizeof(char) * err_msg_len);
snprintf(*error_msg, err_msg_len,
"pe_rm_connect: unknown caller");
error("%s", *error_msg);
return -1;
}
......@@ -945,9 +948,10 @@ extern int pe_rm_connect(rmhandle_t resource_mgr,
if (!name) {
if (hl)
hostlist_destroy(hl);
*error_msg = xstrdup_printf(
"pe_rm_connect: unknown host for ip %s",
connect_param->machine_name[i]);
*error_msg = malloc(sizeof(char) * err_msg_len);
snprintf(*error_msg, err_msg_len,
"pe_rm_connect: unknown host for ip %s",
connect_param->machine_name[i]);
error("%s", *error_msg);
return -1;
}
......@@ -964,10 +968,11 @@ extern int pe_rm_connect(rmhandle_t resource_mgr,
}
if (!hl) {
*error_msg = xstrdup_printf(
"pe_rm_connect: machine_count 0? it came in as "
"%d but we didn't get a hostlist",
connect_param->machine_count);
*error_msg = malloc(sizeof(char) * err_msg_len);
snprintf(*error_msg, err_msg_len,
"pe_rm_connect: machine_count 0? it came in as "
"%d but we didn't get a hostlist",
connect_param->machine_count);
error("%s", *error_msg);
return -1;
}
......@@ -993,9 +998,10 @@ extern int pe_rm_connect(rmhandle_t resource_mgr,
node_cnt, &task_num)
!= SLURM_SUCCESS) {
xfree(total_node_list);
*error_msg = xstrdup_printf(
"pe_rm_connect: problem with hack: %s",
slurm_strerror(errno));
*error_msg = malloc(sizeof(char) * err_msg_len);
snprintf(*error_msg, err_msg_len,
"pe_rm_connect: problem with hack: %s",
slurm_strerror(errno));
error("%s", *error_msg);
return -1;
}
......@@ -1008,9 +1014,10 @@ extern int pe_rm_connect(rmhandle_t resource_mgr,
step_callbacks.step_timeout = _self_timeout;
if (launch_g_step_launch(job, &cio_fds, &global_rc, &step_callbacks)) {
*error_msg = xstrdup_printf(
"pe_rm_connect: problem with launch: %s",
slurm_strerror(errno));
*error_msg = malloc(sizeof(char) * err_msg_len);
snprintf(*error_msg, err_msg_len,
"pe_rm_connect: problem with launch: %s",
slurm_strerror(errno));
error("%s", *error_msg);
return -1;
}
......@@ -1019,16 +1026,19 @@ extern int pe_rm_connect(rmhandle_t resource_mgr,
SLURM_STEP_CTX_USER_MANAGED_SOCKETS,
&fd_cnt, &ctx_sockfds);
if (ctx_sockfds == NULL) {
*error_msg = xstrdup_printf(
"pe_rm_connect: Unable to get pmd IO socket array %d",
rc);
*error_msg = malloc(sizeof(char) * err_msg_len);
snprintf(*error_msg, err_msg_len,
"pe_rm_connect: Unable to get pmd IO socket array %d",
rc);
error("%s", *error_msg);
return -1;
}
if (fd_cnt != task_num) {
*error_msg = xstrdup_printf(
"pe_rm_connect: looking for %d sockets but got back %d",
connect_param->machine_count, fd_cnt);
*error_msg = malloc(sizeof(char) * err_msg_len);
snprintf(*error_msg, err_msg_len,
"pe_rm_connect: looking for %d sockets but "
"got back %d",
connect_param->machine_count, fd_cnt);
error("%s", *error_msg);
return -1;
}
......@@ -1169,7 +1179,9 @@ extern int pe_rm_get_event(rmhandle_t resource_mgr, job_event_t **job_event,
debug("pe_rm_get_event called");
return 0;
} else if (pm_type != PM_POE) {
*error_msg = xstrdup_printf("pe_rm_get_event: unknown caller");
*error_msg = malloc(sizeof(char) * err_msg_len);
snprintf(*error_msg, err_msg_len,
"pe_rm_get_event: unknown caller");
error("%s", *error_msg);
return -1;
}
......@@ -1238,8 +1250,9 @@ extern int pe_rm_get_job_info(rmhandle_t resource_mgr, job_info_t **job_info,
debug("pe_rm_get_job_info called");
return 0;
} else if (pm_type != PM_POE) {
*error_msg = xstrdup_printf(
"pe_rm_get_job_info: unknown caller");
*error_msg = malloc(sizeof(char) * err_msg_len);
snprintf(*error_msg, err_msg_len,
"pe_rm_get_job_info: unknown caller");
error("%s", *error_msg);
return -1;
}
......@@ -1266,8 +1279,9 @@ extern int pe_rm_get_job_info(rmhandle_t resource_mgr, job_info_t **job_info,
slurm_step_ctx_get(job->step_ctx, SLURM_STEP_CTX_RESP, &resp);
if (!resp) {
*error_msg = xstrdup_printf(
"pe_rm_get_job_info: no step response in step ctx");
*error_msg = malloc(sizeof(char) * err_msg_len);
snprintf(*error_msg, err_msg_len,
"pe_rm_get_job_info: no step response in step ctx");
error("%s", *error_msg);
return -1;
}
......@@ -1653,7 +1667,7 @@ extern int pe_rm_init(int *rmapi_version, rmhandle_t *resource_mgr, char *rm_id,
xstrcat(opt.network, "bulk_xfer");
}
xfree(bulk_xfer);
xfree(collectives);
xfree(collectives);
xfree(euidevice);
xfree(euilib);
xfree(immediate);
......@@ -1690,17 +1704,19 @@ extern int pe_rm_init(int *rmapi_version, rmhandle_t *resource_mgr, char *rm_id,
if ((srun_debug = getenv("SLURM_STEP_ID")))
step_id = atoi(srun_debug);
if (job_id == -1 || step_id == -1) {
*error_msg = xstrdup_printf(
"pe_rm_init: SLURM_JOB_ID or SLURM_STEP_ID "
"not found %d.%d", job_id, step_id);
*error_msg = malloc(sizeof(char) * err_msg_len);
snprintf(*error_msg, err_msg_len,
"pe_rm_init: SLURM_JOB_ID or SLURM_STEP_ID "
"not found %d.%d", job_id, step_id);
error("%s", *error_msg);
return -1;
}
job = _read_job_srun_agent();
if (!job) {
*error_msg = xstrdup_printf(
"pe_rm_init: no job created");
*error_msg = malloc(sizeof(char) * err_msg_len);
snprintf(*error_msg, err_msg_len,
"pe_rm_init: no job created");
error("%s", *error_msg);
return -1;
}
......@@ -1715,7 +1731,9 @@ extern int pe_rm_init(int *rmapi_version, rmhandle_t *resource_mgr, char *rm_id,
* PMD to fanout child processes on other nodes */
_spawn_fe_agent();
} else {
*error_msg = xstrdup_printf("pe_rm_init: unknown caller");
*error_msg = malloc(sizeof(char) * err_msg_len);
snprintf(*error_msg, err_msg_len,
"pe_rm_init: unknown caller");
error("%s", *error_msg);
return -1;
}
......@@ -1749,7 +1767,7 @@ extern int pe_rm_send_event(rmhandle_t resource_mgr, job_event_t *job_event,
if ((job_event->event == JOB_CKPT_COMPLETE) && job) {
struct ckpt_end_data *ckpt_end_ptr = (struct ckpt_end_data *)
job_event->event_data;
job_event->event_data;
rc = slurm_checkpoint_complete(job->jobid, job->stepid,
ckpt_end_ptr->ckpt_start_time,
ckpt_end_ptr->ckpt_rc,
......@@ -1800,7 +1818,9 @@ int pe_rm_submit_job(rmhandle_t resource_mgr, job_command_t job_cmd,
if (slurm_cmd_fname)
poe_cmd_fname = getenv("MP_CMDFILE");
} else {
*error_msg = xstrdup_printf("pe_rm_submit_job: unknown caller");
*error_msg = malloc(sizeof(char) * err_msg_len);
snprintf(*error_msg, err_msg_len,
"pe_rm_submit_job: unknown caller");
error("%s", *error_msg);
return -1;
}
......@@ -1808,9 +1828,10 @@ int pe_rm_submit_job(rmhandle_t resource_mgr, job_command_t job_cmd,
debug("got pe_rm_submit_job called %d", job_cmd.job_format);
if (job_cmd.job_format != 1) {
/* We don't handle files */
*error_msg = xstrdup_printf(
"pe_rm_submit_job: SLURM doesn't handle files "
"to submit_job");
*error_msg = malloc(sizeof(char) * err_msg_len);
snprintf(*error_msg, err_msg_len,
"pe_rm_submit_job: SLURM doesn't handle files "
"to submit_job");
error("%s", *error_msg);
return -1;
}
......
0% Loading. Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment