diff --git a/src/slurmctld/controller.c b/src/slurmctld/controller.c index 4db83c25ffccc8c05942485e933cd0e8c520843f..dab5152f4bf556ae1c5a80da9b0e9803a52fad89 100644 --- a/src/slurmctld/controller.c +++ b/src/slurmctld/controller.c @@ -238,7 +238,7 @@ int main(int argc, char *argv[]) _run_backup(); else if (strcmp(node_name, slurmctld_conf.control_machine)) fatal - ("this machine (%s) is not the primary (%s) or backup (%s) controller", + ("this host (%s) not valid controller (%s or %s)", node_name, slurmctld_conf.control_machine, slurmctld_conf.backup_controller); else /* primary tells secondary to shutdown */ diff --git a/src/slurmctld/job_mgr.c b/src/slurmctld/job_mgr.c index f02799b811cc045e2f38f45a72c15278fdf7ff48..a0e93a755bfae0fc61c75535665460a36ae8bd24 100644 --- a/src/slurmctld/job_mgr.c +++ b/src/slurmctld/job_mgr.c @@ -883,7 +883,7 @@ int kill_running_job_by_node_name(char *node_name) == 0) continue; /* job not on this node */ - error("Running job_id %u on failed node node %s", + error("Running job_id %u on failed node %s", job_record_point->job_id, node_name); job_count++; if ((job_record_point->details == NULL) || @@ -2511,13 +2511,12 @@ validate_jobs_on_node(char *node_name, uint32_t * job_count, return; } - /* Ensure that jobs which are running are really supposed to be there */ + /* Ensure that jobs running are really supposed to be there */ for (i = 0; i < *job_count; i++) { job_ptr = find_job_record(job_id_ptr[i]); if (job_ptr == NULL) { - /* FIXME: In the future try to let job run */ - error("Orphan job_id %u reported on node %s", - job_id_ptr[i], node_name); + error("Orphan job %u.%u reported on node %s", + job_id_ptr[i], step_id_ptr[i], node_name); _signal_job_on_node(job_id_ptr[i], step_id_ptr[i], SIGKILL, node_name); /* We may well have pending purge job RPC to send @@ -2527,12 +2526,13 @@ validate_jobs_on_node(char *node_name, uint32_t * job_count, else if (job_ptr->job_state == JOB_RUNNING) { if (bit_test(job_ptr->node_bitmap, node_inx)) { 
jobs_running++; - debug3("Registered job_id %u on node %s ", - job_id_ptr[i], node_name); + debug3("Registered job %u.%u on node %s ", + job_id_ptr[i], step_id_ptr[i], + node_name); } else { error - ("REGISTERED JOB_ID %u ON WRONG NODE %s ", - job_id_ptr[i], node_name); + ("REGISTERED JOB %u.%u ON WRONG NODE %s ", + job_id_ptr[i], step_id_ptr[i], node_name); _signal_job_on_node(job_id_ptr[i], step_id_ptr[i], SIGKILL, node_name); @@ -2540,9 +2540,9 @@ validate_jobs_on_node(char *node_name, uint32_t * job_count, } else if (job_ptr->job_state == JOB_PENDING) { - /* FIXME: In the future try to let job run */ - error("REGISTERED PENDING JOB_ID %u ON NODE %s ", - job_id_ptr[i], node_name); + error("REGISTERED PENDING JOB %u.%u ON NODE %s ", + job_id_ptr[i], step_id_ptr[i], node_name); + error("CODE DEVELOPMENT NEEDED HERE"); job_ptr->job_state = JOB_FAILED; last_job_update = time(NULL); job_ptr->end_time = time(NULL); @@ -2553,8 +2553,8 @@ validate_jobs_on_node(char *node_name, uint32_t * job_count, else { /* else job is supposed to be done */ error - ("Registered job_id %u in state %s on node %s ", - job_id_ptr[i], + ("Registered job %u.%u in state %s on node %s ", + job_id_ptr[i], step_id_ptr[i], job_state_string(job_ptr->job_state), node_name); _signal_job_on_node(job_id_ptr[i], step_id_ptr[i],