From e12b5c80e8efd81fe1c226719a42b3363bf04f18 Mon Sep 17 00:00:00 2001 From: Moe Jette <jette1@llnl.gov> Date: Tue, 19 Nov 2002 18:18:35 +0000 Subject: [PATCH] If job's return code is non-zero, set state to FAILED on completion. --- src/slurmctld/controller.c | 98 ++++++++++---------------------------- src/slurmctld/job_mgr.c | 8 +++- src/slurmctld/slurmctld.h | 14 +++++- 3 files changed, 44 insertions(+), 76 deletions(-) diff --git a/src/slurmctld/controller.c b/src/slurmctld/controller.c index b333b59e55d..27eeca1a18b 100644 --- a/src/slurmctld/controller.c +++ b/src/slurmctld/controller.c @@ -911,13 +911,11 @@ static void _slurm_rpc_job_step_cancel(slurm_msg_t * msg) slurmctld_lock_t job_write_lock = { NO_LOCK, WRITE_LOCK, WRITE_LOCK, NO_LOCK }; - uid_t uid = 0; + uid_t uid; start_time = clock(); debug("Processing RPC: REQUEST_CANCEL_JOB_STEP"); -#ifdef HAVE_AUTHD uid = slurm_auth_uid(msg->cred); -#endif lock_slurmctld(job_write_lock); /* do RPC call */ @@ -927,7 +925,9 @@ static void _slurm_rpc_job_step_cancel(slurm_msg_t * msg) /* return result */ if (error_code) { - info("_slurm_rpc_job_step_cancel error %d for %u, time=%ld", error_code, job_step_id_msg->job_id, (long) (clock() - start_time)); + info("_slurm_rpc_job_step_cancel error %d for %u, time=%ld", + error_code, job_step_id_msg->job_id, + (long) (clock() - start_time)); slurm_send_rc_msg(msg, error_code); } else { info( @@ -981,15 +981,13 @@ static void _slurm_rpc_job_step_complete(slurm_msg_t * msg) slurmctld_lock_t job_write_lock = { NO_LOCK, WRITE_LOCK, WRITE_LOCK, NO_LOCK }; - uid_t uid = 0; + uid_t uid; bool job_requeue = false; /* init */ start_time = clock(); debug("Processing RPC: REQUEST_COMPLETE_JOB_STEP"); -#ifdef HAVE_AUTHD uid = slurm_auth_uid(msg->cred); -#endif lock_slurmctld(job_write_lock); /* do RPC call */ /* First set node down as needed on fatal error */ @@ -998,13 +996,11 @@ static void _slurm_rpc_job_step_complete(slurm_msg_t * msg) complete_job_step_msg->job_id, 
complete_job_step_msg->node_name, slurm_strerror (complete_job_step_msg->slurm_rc)); -#ifdef HAVE_AUTHD if ((uid != 0) && (uid != getuid())) { error_code = ESLURM_USER_ID_MISSING; error("Security violation, uid %u can't set node down", (unsigned int) uid); } -#endif if (error_code == SLURM_SUCCESS) { update_node_msg_t update_node_msg; update_node_msg.node_names = @@ -1019,7 +1015,8 @@ static void _slurm_rpc_job_step_complete(slurm_msg_t * msg) /* Mark job and/or job step complete */ if (complete_job_step_msg->job_step_id == NO_VAL) { error_code = job_complete(complete_job_step_msg->job_id, - uid, job_requeue); + uid, job_requeue, + complete_job_step_msg->job_rc); unlock_slurmctld(job_write_lock); /* return result */ @@ -1135,7 +1132,7 @@ static void _slurm_rpc_update_job(slurm_msg_t * msg) slurmctld_lock_t job_write_lock = { NO_LOCK, WRITE_LOCK, READ_LOCK, READ_LOCK }; - uid_t uid = 0; + uid_t uid; start_time = clock(); debug("Processing RPC: REQUEST_UPDATE_JOB"); @@ -1143,9 +1140,7 @@ static void _slurm_rpc_update_job(slurm_msg_t * msg) unlock_slurmctld(job_write_lock); /* do RPC call */ -#ifdef HAVE_AUTHD uid = slurm_auth_uid(msg->cred); -#endif error_code = update_job(job_desc_msg, uid); /* return result */ @@ -1179,20 +1174,16 @@ static void _slurm_rpc_update_node(slurm_msg_t * msg) slurmctld_lock_t node_write_lock = { NO_LOCK, NO_LOCK, WRITE_LOCK, NO_LOCK }; -#ifdef HAVE_AUTHD - uid_t uid = 0; -#endif + uid_t uid; start_time = clock(); debug("Processing RPC: REQUEST_UPDATE_NODE"); -#ifdef HAVE_AUTHD uid = slurm_auth_uid(msg->cred); if ((uid != 0) && (uid != getuid())) { error_code = ESLURM_USER_ID_MISSING; error("Security violation, UPDATE_NODE RPC from uid %u", (unsigned int) uid); } -#endif if (error_code == 0) { /* do RPC call */ @@ -1234,13 +1225,10 @@ static void _slurm_rpc_update_partition(slurm_msg_t * msg) slurmctld_lock_t part_write_lock = { NO_LOCK, NO_LOCK, READ_LOCK, WRITE_LOCK }; -#ifdef HAVE_AUTHD - uid_t uid = 0; -#endif + uid_t uid; start_time 
= clock(); debug("Processing RPC: REQUEST_UPDATE_PARTITION"); -#ifdef HAVE_AUTHD uid = slurm_auth_uid(msg->cred); if ((uid != 0) && (uid != getuid())) { error_code = ESLURM_USER_ID_MISSING; @@ -1248,7 +1236,6 @@ static void _slurm_rpc_update_partition(slurm_msg_t * msg) "Security violation, UPDATE_PARTITION RPC from uid %u", (unsigned int) uid); } -#endif if (error_code == 0) { /* do RPC call */ @@ -1265,7 +1252,8 @@ static void _slurm_rpc_update_partition(slurm_msg_t * msg) (long) (clock() - start_time)); slurm_send_rc_msg(msg, error_code); } else { - info("_slurm_rpc_update_partition complete for partition %s, time=%ld", part_desc_ptr->name, (long) (clock() - start_time)); + info("_slurm_rpc_update_partition complete for partition %s, time=%ld", + part_desc_ptr->name, (long) (clock() - start_time)); slurm_send_rc_msg(msg, SLURM_SUCCESS); /* NOTE: These functions provide their own locks */ @@ -1289,14 +1277,13 @@ static void _slurm_rpc_submit_batch_job(slurm_msg_t * msg) slurmctld_lock_t job_write_lock = { NO_LOCK, WRITE_LOCK, READ_LOCK, READ_LOCK }; - uid_t uid = 0; + uid_t uid; start_time = clock(); debug("Processing RPC: REQUEST_SUBMIT_BATCH_JOB"); /* do RPC call */ dump_job_desc(job_desc_msg); -#ifdef HAVE_AUTHD uid = slurm_auth_uid(msg->cred); if ((uid != job_desc_msg->user_id) && (uid != 0) && (uid != getuid())) { @@ -1304,7 +1291,6 @@ static void _slurm_rpc_submit_batch_job(slurm_msg_t * msg) error("Security violation, SUBMIT_JOB from uid %u", (unsigned int) uid); } -#endif if (error_code == 0) { lock_slurmctld(job_write_lock); error_code = job_allocate(job_desc_msg, &job_id, @@ -1352,20 +1338,18 @@ void _slurm_rpc_allocate_resources(slurm_msg_t * msg, uint8_t immediate) slurmctld_lock_t job_write_lock = { NO_LOCK, WRITE_LOCK, WRITE_LOCK, READ_LOCK }; - uid_t uid = 0; + uid_t uid; uint16_t node_cnt; slurm_addr *node_addr; start_time = clock(); if (immediate) - debug - ("Processing RPC: REQUEST_IMMEDIATE_RESOURCE_ALLOCATION"); + debug("Processing RPC: 
REQUEST_IMMEDIATE_RESOURCE_ALLOCATION"); else debug("Processing RPC: REQUEST_RESOURCE_ALLOCATION"); /* do RPC call */ dump_job_desc(job_desc_msg); -#ifdef HAVE_AUTHD uid = slurm_auth_uid(msg->cred); if ((uid != job_desc_msg->user_id) && (uid != 0) && (uid != getuid())) { @@ -1373,7 +1357,6 @@ void _slurm_rpc_allocate_resources(slurm_msg_t * msg, uint8_t immediate) error("Security violation, RESOURCE_ALLOCATE from uid %u", (unsigned int) uid); } -#endif if (error_code == 0) { lock_slurmctld(job_write_lock); error_code = job_allocate(job_desc_msg, &job_id, @@ -1434,7 +1417,7 @@ static void _slurm_rpc_allocate_and_run(slurm_msg_t * msg) slurmctld_lock_t job_write_lock = { NO_LOCK, WRITE_LOCK, WRITE_LOCK, READ_LOCK }; - uid_t uid = 0; + uid_t uid; uint16_t node_cnt; slurm_addr *node_addr; @@ -1443,7 +1426,6 @@ static void _slurm_rpc_allocate_and_run(slurm_msg_t * msg) /* do RPC call */ dump_job_desc(job_desc_msg); -#ifdef HAVE_AUTHD uid = slurm_auth_uid(msg->cred); if ((uid != job_desc_msg->user_id) && (uid != 0) && (uid != getuid())) { @@ -1452,7 +1434,7 @@ static void _slurm_rpc_allocate_and_run(slurm_msg_t * msg) ("Security violation, ALLOCATE_AND_RUN RPC from uid %u", (unsigned int) uid); } -#endif + if (error_code == 0) { lock_slurmctld(job_write_lock); error_code = job_allocate(job_desc_msg, &job_id, @@ -1531,22 +1513,18 @@ static void _slurm_rpc_old_job_alloc(slurm_msg_t * msg) }; uint16_t node_cnt; slurm_addr *node_addr; -#ifdef HAVE_AUTHD - uid_t uid = 0; -#endif + uid_t uid; start_time = clock(); debug("Processing RPC: REQUEST_OLD_JOB_RESOURCE_ALLOCATION"); /* do RPC call */ -#ifdef HAVE_AUTHD uid = slurm_auth_uid(msg->cred); if ((uid != job_desc_msg->uid) && (uid != 0) && (uid != getuid())) { error_code = ESLURM_USER_ID_MISSING; error("Security violation, RESOURCE_ALLOCATE from uid %u", (unsigned int) uid); } -#endif if (error_code == 0) { lock_slurmctld(job_read_lock); error_code = old_job_info(job_desc_msg->uid, @@ -1603,14 +1581,13 @@ static void 
_slurm_rpc_job_will_run(slurm_msg_t * msg) slurmctld_lock_t job_write_lock = { NO_LOCK, WRITE_LOCK, READ_LOCK, READ_LOCK }; - uid_t uid = 0; + uid_t uid; start_time = clock(); debug("Processing RPC: REQUEST_JOB_WILL_RUN"); /* do RPC call */ dump_job_desc(job_desc_msg); -#ifdef HAVE_AUTHD uid = slurm_auth_uid(msg->cred); if ((uid != job_desc_msg->user_id) && (uid != 0) && (uid != getuid())) { @@ -1618,7 +1595,6 @@ static void _slurm_rpc_job_will_run(slurm_msg_t * msg) error("Security violation, JOB_WILL_RUN RPC from uid %u", (unsigned int) uid); } -#endif if (error_code == 0) { lock_slurmctld(job_write_lock); @@ -1647,18 +1623,14 @@ static void _slurm_rpc_ping(slurm_msg_t * msg) { /* init */ int error_code = SLURM_SUCCESS; -#ifdef HAVE_AUTHD - uid_t uid = 0; -#endif + uid_t uid; -#ifdef HAVE_AUTHD uid = slurm_auth_uid(msg->cred); if ((uid != 0) && (uid != getuid())) { error("Security violation, PING RPC from uid %u", (unsigned int) uid); error_code = ESLURM_USER_ID_MISSING; } -#endif /* return result */ slurm_send_rc_msg(msg, error_code); @@ -1676,20 +1648,16 @@ static void _slurm_rpc_reconfigure_controller(slurm_msg_t * msg) slurmctld_lock_t config_write_lock = { WRITE_LOCK, WRITE_LOCK, WRITE_LOCK, WRITE_LOCK }; -#ifdef HAVE_AUTHD - uid_t uid = 0; -#endif + uid_t uid; start_time = clock(); debug("Processing RPC: REQUEST_RECONFIGURE"); -#ifdef HAVE_AUTHD uid = slurm_auth_uid(msg->cred); if ((uid != 0) && (uid != getuid())) { error("Security violation, RECONFIGURE RPC from uid %u", (unsigned int) uid); error_code = ESLURM_USER_ID_MISSING; } -#endif /* do RPC call */ if (error_code == 0) { @@ -1729,8 +1697,7 @@ static void _slurm_rpc_shutdown_controller(slurm_msg_t * msg) int error_code = 0, i; uint16_t core_arg = 0; shutdown_msg_t *shutdown_msg = (shutdown_msg_t *) msg->data; -#ifdef HAVE_AUTHD - uid_t uid = 0; + uid_t uid; uid = slurm_auth_uid(msg->cred); if ((uid != 0) && (uid != getuid())) { @@ -1738,7 +1705,6 @@ static void 
_slurm_rpc_shutdown_controller(slurm_msg_t * msg) (unsigned int) uid); error_code = ESLURM_USER_ID_MISSING; } -#endif if (error_code); else if (msg->msg_type == REQUEST_CONTROL) { info("Performing RPC: REQUEST_CONTROL"); @@ -1780,8 +1746,7 @@ static void _slurm_rpc_shutdown_controller(slurm_msg_t * msg) static void _slurm_rpc_shutdown_controller_immediate(slurm_msg_t * msg) { int error_code = 0; -#ifdef HAVE_AUTHD - uid_t uid = 0; + uid_t uid; uid = slurm_auth_uid(msg->cred); if ((uid != 0) && (uid != getuid())) { @@ -1790,7 +1755,6 @@ static void _slurm_rpc_shutdown_controller_immediate(slurm_msg_t * msg) (unsigned int) uid); error_code = ESLURM_USER_ID_MISSING; } -#endif /* do RPC call */ /* No op: just used to knock loose accept RPC thread */ @@ -1815,14 +1779,11 @@ static void _slurm_rpc_job_step_create(slurm_msg_t * msg) slurmctld_lock_t job_write_lock = { NO_LOCK, WRITE_LOCK, READ_LOCK, NO_LOCK }; -#ifdef HAVE_AUTHD - uid_t uid = 0; -#endif + uid_t uid; start_time = clock(); debug("Processing RPC: REQUEST_JOB_STEP_CREATE"); dump_step_desc(req_step_msg); -#ifdef HAVE_AUTHD uid = slurm_auth_uid(msg->cred); if ((uid != req_step_msg->user_id) && (uid != 0) && (uid != getuid())) { @@ -1831,7 +1792,6 @@ static void _slurm_rpc_job_step_create(slurm_msg_t * msg) ("Security violation, JOB_STEP_CREATE RPC from uid %u", (unsigned int) uid); } -#endif if (error_code == 0) { /* issue the RPC */ @@ -1885,20 +1845,16 @@ static void _slurm_rpc_node_registration(slurm_msg_t * msg) slurmctld_lock_t job_write_lock = { NO_LOCK, WRITE_LOCK, WRITE_LOCK, NO_LOCK }; -#ifdef HAVE_AUTHD - uid_t uid = 0; -#endif + uid_t uid; start_time = clock(); debug("Processing RPC: MESSAGE_NODE_REGISTRATION_STATUS"); -#ifdef HAVE_AUTHD uid = slurm_auth_uid(msg->cred); if ((uid != 0) && (uid != getuid())) { error_code = ESLURM_USER_ID_MISSING; error("Security violation, NODE_REGISTER RPC from uid %u", (unsigned int) uid); } -#endif if (error_code == 0) { /* do RPC call */ 
lock_slurmctld(job_write_lock); @@ -2268,8 +2224,7 @@ void *background_rpc_mgr(void *no_data) int background_process_msg(slurm_msg_t * msg) { int error_code = 0; -#ifdef HAVE_AUTHD - uid_t uid = 0; + uid_t uid; uid = slurm_auth_uid(msg->cred); if ((uid != 0) && (uid != getuid())) { @@ -2277,7 +2232,6 @@ int background_process_msg(slurm_msg_t * msg) (unsigned int) uid); error_code = ESLURM_USER_ID_MISSING; } -#endif if (error_code == 0) { if (msg->msg_type == REQUEST_SHUTDOWN_IMMEDIATE) { diff --git a/src/slurmctld/job_mgr.c b/src/slurmctld/job_mgr.c index b38ba02af4f..977b4f50074 100644 --- a/src/slurmctld/job_mgr.c +++ b/src/slurmctld/job_mgr.c @@ -1089,12 +1089,13 @@ job_cancel (uint32_t job_id, uid_t uid) * IN job_id - id of the job which completed * IN uid - user id of user issuing the RPC * IN requeue - job should be run again if possible + * IN job_return_code - job's return code, if set then set state to JOB_FAILED * RET - 0 on success, otherwise ESLURM error code * global: job_list - pointer global job list * last_job_update - time of last job table update */ int -job_complete (uint32_t job_id, uid_t uid, bool requeue) +job_complete (uint32_t job_id, uid_t uid, bool requeue, uint32_t job_return_code) { struct job_record *job_ptr; @@ -1133,7 +1134,10 @@ job_complete (uint32_t job_id, uid_t uid, bool requeue) job_ptr->job_state = JOB_PENDING; info ("Requeing job %u", job_ptr->job_id); } else { - job_ptr->job_state = JOB_COMPLETE; + if (job_return_code) + job_ptr->job_state = JOB_FAILED; + else + job_ptr->job_state = JOB_COMPLETE; job_ptr->end_time = time(NULL); delete_job_details(job_ptr); delete_all_step_records(job_ptr); diff --git a/src/slurmctld/slurmctld.h b/src/slurmctld/slurmctld.h index 6df5abe9a60..dac3f8b1fbc 100644 --- a/src/slurmctld/slurmctld.h +++ b/src/slurmctld/slurmctld.h @@ -375,8 +375,18 @@ extern int job_cancel (uint32_t job_id, uid_t uid); /* job_step_cancel - cancel the specified job step */ extern int job_step_cancel (uint32_t job_id, 
uint32_t job_step_id, uid_t uid ); -/* job_complete - note the completion the specified job */ -extern int job_complete (uint32_t job_id, uid_t uid, bool requeue); +/* + * job_complete - note the normal termination of the specified job + * IN job_id - id of the job which completed + * IN uid - user id of user issuing the RPC + * IN requeue - job should be run again if possible + * IN job_return_code - job's return code, if non-zero then set state to JOB_FAILED + * RET - 0 on success, otherwise ESLURM error code + * global: job_list - pointer to global job list + * last_job_update - time of last job table update + */ +extern int job_complete (uint32_t job_id, uid_t uid, bool requeue, + uint32_t job_return_code); /* job_step_complete - note the completion the specified job step*/ extern int job_step_complete (uint32_t job_id, uint32_t job_step_id, uid_t uid); -- GitLab