diff --git a/src/slurmctld/job_mgr.c b/src/slurmctld/job_mgr.c
index a76176d1d9cd07f19ae36e92accfe37e72891071..693aa9b018615fd66558a885c33d419abd8e536b 100644
--- a/src/slurmctld/job_mgr.c
+++ b/src/slurmctld/job_mgr.c
@@ -12126,6 +12126,16 @@ extern void job_post_resize_acctg(struct job_record *job_ptr)
 	job_ptr->job_state &= (~JOB_RESIZING);
 }
 
+static char *_build_step_id(char *buf, int buf_len,
+			    uint32_t job_id, uint32_t step_id)
+{
+	if (step_id == SLURM_BATCH_SCRIPT)
+		snprintf(buf, buf_len, "%u.batch", job_id);
+	else
+		snprintf(buf, buf_len, "%u.%u", job_id, step_id);
+	return buf;
+}
+
 /*
  * validate_jobs_on_node - validate that any jobs that should be on the node
  *	are actually running, if not clean up the job records and/or node
@@ -12140,6 +12150,7 @@ validate_jobs_on_node(slurm_node_registration_status_msg_t *reg_msg)
 	struct node_record *node_ptr;
 	struct job_record *job_ptr;
 	struct step_record *step_ptr;
+	char step_str[64];
 	time_t now = time(NULL);
 
 	node_ptr = find_node_record(reg_msg->node_name);
@@ -12173,8 +12184,10 @@ validate_jobs_on_node(slurm_node_registration_status_msg_t *reg_msg)
 	for (i = 0; i < reg_msg->job_count; i++) {
 		if ( (reg_msg->job_id[i] >= MIN_NOALLOC_JOBID) &&
 		     (reg_msg->job_id[i] <= MAX_NOALLOC_JOBID) ) {
-			info("NoAllocate job %u.%u reported on node %s",
-			     reg_msg->job_id[i], reg_msg->step_id[i],
+			info("NoAllocate job %s reported on node %s",
+			     _build_step_id(step_str, sizeof(step_str),
+					    reg_msg->job_id[i],
+					    reg_msg->step_id[i]),
 			     reg_msg->node_name);
 			continue;
 		}
@@ -12183,8 +12196,10 @@ validate_jobs_on_node(slurm_node_registration_status_msg_t *reg_msg)
 
 		job_ptr = find_job_record(reg_msg->job_id[i]);
 		if (job_ptr == NULL) {
-			error("Orphan job %u.%u reported on node %s",
-			      reg_msg->job_id[i], reg_msg->step_id[i],
+			error("Orphan job %s reported on node %s",
+			      _build_step_id(step_str, sizeof(step_str),
+					     reg_msg->job_id[i],
+					     reg_msg->step_id[i]),
 			      reg_msg->node_name);
 			abort_job_on_node(reg_msg->job_id[i],
 					  job_ptr, node_ptr->name);
@@ -12193,9 +12208,11 @@ validate_jobs_on_node(slurm_node_registration_status_msg_t *reg_msg)
 		else if (IS_JOB_RUNNING(job_ptr) ||
 			 IS_JOB_SUSPENDED(job_ptr)) {
 			if (bit_test(job_ptr->node_bitmap, node_inx)) {
-				debug3("Registered job %u.%u on node %s ",
-				       reg_msg->job_id[i],
-				       reg_msg->step_id[i],
+				debug3("Registered job %s on node %s ",
+				       _build_step_id(step_str,
+						      sizeof(step_str),
+						      reg_msg->job_id[i],
+						      reg_msg->step_id[i]),
 				       reg_msg->node_name);
 				if ((job_ptr->batch_flag) &&
 				    (node_inx == bit_ffs(
@@ -12213,9 +12230,11 @@ validate_jobs_on_node(slurm_node_registration_status_msg_t *reg_msg)
 			/* Typically indicates a job requeue and
 			 * restart on another nodes. A node from the
 			 * original allocation just responded here. */
-			error("Registered job %u.%u on wrong node %s ",
-			      reg_msg->job_id[i],
-			      reg_msg->step_id[i],
+			error("Registered job %s on wrong node %s ",
+			      _build_step_id(step_str,
+					     sizeof(step_str),
+					     reg_msg->job_id[i],
+					     reg_msg->step_id[i]),
 			      reg_msg->node_name);
 			info("%s: job nodes %s count %d inx %d",
 			     __func__, job_ptr->nodes,
@@ -12236,8 +12255,10 @@ validate_jobs_on_node(slurm_node_registration_status_msg_t *reg_msg)
 		else if (IS_JOB_PENDING(job_ptr)) {
 			/* Typically indicates a job requeue and the hung
 			 * slurmd that went DOWN is now responding */
-			error("Registered PENDING job %u.%u on node %s ",
-			      reg_msg->job_id[i], reg_msg->step_id[i],
+			error("Registered PENDING job %s on node %s ",
+			      _build_step_id(step_str, sizeof(step_str),
+					     reg_msg->job_id[i],
+					     reg_msg->step_id[i]),
 			      reg_msg->node_name);
 			abort_job_on_node(reg_msg->job_id[i], job_ptr,
 					  node_ptr->name);
@@ -12245,14 +12266,18 @@ validate_jobs_on_node(slurm_node_registration_status_msg_t *reg_msg)
 		else if (difftime(now, job_ptr->end_time) <
 			 slurm_get_msg_timeout()) {
 			/* Race condition */
-			debug("Registered newly completed job %u.%u on %s",
-			      reg_msg->job_id[i], reg_msg->step_id[i],
-			      node_ptr->name);
+			debug("Registered newly completed job %s on %s",
+			      _build_step_id(step_str, sizeof(step_str),
+					     reg_msg->job_id[i],
+					     reg_msg->step_id[i]),
+			      node_ptr->name);
 		}
 
 		else {		/* else job is supposed to be done */
-			error("Registered job %u.%u in state %s on node %s ",
-			      reg_msg->job_id[i], reg_msg->step_id[i],
+			error("Registered job %s in state %s on node %s ",
+			      _build_step_id(step_str, sizeof(step_str),
+					     reg_msg->job_id[i],
+					     reg_msg->step_id[i]),
 			      job_state_string(job_ptr->job_state),
 			      reg_msg->node_name);
 			kill_job_on_node(reg_msg->job_id[i], job_ptr,
diff --git a/src/slurmctld/node_mgr.c b/src/slurmctld/node_mgr.c
index 98a718de1c5a1879eabc0c0a5ab8677c874b7596..0f5d10afed6092a3b397f0834337eecf7665779e 100644
--- a/src/slurmctld/node_mgr.c
+++ b/src/slurmctld/node_mgr.c
@@ -2510,6 +2510,16 @@ static front_end_record_t * _front_end_reg(
 	return front_end_ptr;
 }
 
+static char *_build_step_id(char *buf, int buf_len,
+			    uint32_t job_id, uint32_t step_id)
+{
+	if (step_id == SLURM_BATCH_SCRIPT)
+		snprintf(buf, buf_len, "%u.batch", job_id);
+	else
+		snprintf(buf, buf_len, "%u.%u", job_id, step_id);
+	return buf;
+}
+
 /*
  * validate_nodes_via_front_end - validate all nodes on a cluster as having
  *	a valid configuration as soon as the front-end registers. Individual
@@ -2535,6 +2545,7 @@ extern int validate_nodes_via_front_end(
 	char *host_str = NULL, *reason_down = NULL;
 	uint32_t node_flags;
 	front_end_record_t *front_end_ptr;
+	char step_str[64];
 
 	if (reg_msg->up_time > now) {
 		error("Node up_time on %s is invalid: %u>%u",
@@ -2564,8 +2575,10 @@ extern int validate_nodes_via_front_end(
 	for (i = 0; i < reg_msg->job_count; i++) {
 		if ( (reg_msg->job_id[i] >= MIN_NOALLOC_JOBID) &&
 		     (reg_msg->job_id[i] <= MAX_NOALLOC_JOBID) ) {
-			info("NoAllocate job %u.%u reported",
-			     reg_msg->job_id[i], reg_msg->step_id[i]);
+			info("NoAllocate job %s reported",
+			     _build_step_id(step_str, sizeof(step_str),
+					    reg_msg->job_id[i],
+					    reg_msg->step_id[i]));
 			continue;
 		}
 
@@ -2576,8 +2589,10 @@ extern int validate_nodes_via_front_end(
 			node_ptr += j;
 
 		if (job_ptr == NULL) {
-			error("Orphan job %u.%u reported on %s",
-			      reg_msg->job_id[i], reg_msg->step_id[i],
+			error("Orphan job %s reported on node %s",
+			      _build_step_id(step_str, sizeof(step_str),
+					     reg_msg->job_id[i],
+					     reg_msg->step_id[i]),
 			      front_end_ptr->name);
 			abort_job_on_node(reg_msg->job_id[i], job_ptr,
 					  front_end_ptr->name);
@@ -2590,8 +2605,10 @@ extern int validate_nodes_via_front_end(
 
 		if (IS_JOB_RUNNING(job_ptr) ||
 		    IS_JOB_SUSPENDED(job_ptr)) {
-			debug3("Registered job %u.%u on %s",
-			       reg_msg->job_id[i], reg_msg->step_id[i],
+			debug3("Registered job %s on %s",
+			       _build_step_id(step_str, sizeof(step_str),
+					      reg_msg->job_id[i],
+					      reg_msg->step_id[i]),
 			       front_end_ptr->name);
 			if (job_ptr->batch_flag) {
 				/* NOTE: Used for purging defunct batch jobs */
@@ -2609,8 +2626,10 @@ extern int validate_nodes_via_front_end(
 		else if (IS_JOB_PENDING(job_ptr)) {
 			/* Typically indicates a job requeue and the hung
 			 * slurmd that went DOWN is now responding */
-			error("Registered PENDING job %u.%u on %s",
-			      reg_msg->job_id[i], reg_msg->step_id[i],
+			error("Registered PENDING job %s on %s",
+			      _build_step_id(step_str, sizeof(step_str),
+					     reg_msg->job_id[i],
+					     reg_msg->step_id[i]),
 			      front_end_ptr->name);
 			abort_job_on_node(reg_msg->job_id[i], job_ptr,
 					  front_end_ptr->name);
@@ -2618,16 +2637,20 @@ extern int validate_nodes_via_front_end(
 		else if (difftime(now, job_ptr->end_time) <
 			 slurm_get_msg_timeout()) {
 			/* Race condition */
-			debug("Registered newly completed job %u.%u on %s",
-			      reg_msg->job_id[i], reg_msg->step_id[i],
-			      front_end_ptr->name);
+			debug("Registered newly completed job %s on %s",
+			      _build_step_id(step_str, sizeof(step_str),
+					     reg_msg->job_id[i],
+					     reg_msg->step_id[i]),
+			      front_end_ptr->name);
 		}
 
 		else {		/* else job is supposed to be done */
-			error("Registered job %u.%u in state %s on %s",
-			      reg_msg->job_id[i], reg_msg->step_id[i],
-			      job_state_string(job_ptr->job_state),
-			      front_end_ptr->name);
+			error("Registered job %s in state %s on %s",
+			      _build_step_id(step_str, sizeof(step_str),
+					     reg_msg->job_id[i],
+					     reg_msg->step_id[i]),
+			      job_state_string(job_ptr->job_state),
+			      front_end_ptr->name);
 			kill_job_on_node(reg_msg->job_id[i], job_ptr,
 					 node_ptr);
 		}
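
For context, the helper added (identically) to both files renders a step id as "<job_id>.batch" when the step id equals the SLURM_BATCH_SCRIPT sentinel, and as "<job_id>.<step_id>" otherwise. Below is a minimal standalone sketch of that behavior; the numeric stand-in for SLURM_BATCH_SCRIPT and the main() harness are illustrative assumptions only (the real constant is defined in slurm.h, and the helper is file-local to slurmctld):

/* Sketch: compile standalone to see the formatting the patch produces.
 * ASSUMPTION: the SLURM_BATCH_SCRIPT value below is a stand-in so the
 * sketch builds on its own; real code must use the slurm.h definition. */
#include <stdio.h>
#include <stdint.h>

#define SLURM_BATCH_SCRIPT 0xfffffffeu	/* stand-in sentinel value */

static char *_build_step_id(char *buf, int buf_len,
			    uint32_t job_id, uint32_t step_id)
{
	if (step_id == SLURM_BATCH_SCRIPT)
		snprintf(buf, buf_len, "%u.batch", job_id);
	else
		snprintf(buf, buf_len, "%u.%u", job_id, step_id);
	return buf;
}

int main(void)
{
	char step_str[64];

	/* Batch step: prints "1234.batch" rather than the raw sentinel
	 * ("1234.4294967294"), which is the point of the patch. */
	printf("%s\n", _build_step_id(step_str, sizeof(step_str),
				      1234, SLURM_BATCH_SCRIPT));
	/* Ordinary step: keeps the old "job.step" form, "1234.7". */
	printf("%s\n", _build_step_id(step_str, sizeof(step_str),
				      1234, 7));
	return 0;
}

Every call site passes a stack buffer (char step_str[64]) together with sizeof(step_str), so snprintf's truncation guard keeps the formatting safe even for maximal 32-bit job and step ids, whose longest rendering ("4294967294.4294967294") fits comfortably in 64 bytes.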