diff --git a/src/slurmctld/controller.c b/src/slurmctld/controller.c index 6e7bb0e99a5c8784e28a3d2262018ae12a7c26b4..b333b59e55df145f8ba5ba00a98087456266c208 100644 --- a/src/slurmctld/controller.c +++ b/src/slurmctld/controller.c @@ -45,8 +45,10 @@ #include <netinet/in.h> #include <unistd.h> +#include "src/common/credential_utils.h" #include "src/common/hostlist.h" #include "src/common/log.h" +#include "src/common/macros.h" #include "src/common/pack.h" #include "src/common/read_config.h" #include "src/common/slurm_auth.h" @@ -54,10 +56,10 @@ #include "src/common/slurm_protocol_api.h" #include "src/common/macros.h" #include "src/common/xstring.h" + #include "src/slurmctld/agent.h" #include "src/slurmctld/locks.h" #include "src/slurmctld/slurmctld.h" -#include "src/common/credential_utils.h" #define BUF_SIZE 1024 #define DEFAULT_DAEMONIZE 0 @@ -980,6 +982,7 @@ static void _slurm_rpc_job_step_complete(slurm_msg_t * msg) WRITE_LOCK, NO_LOCK }; uid_t uid = 0; + bool job_requeue = false; /* init */ start_time = clock(); @@ -990,16 +993,15 @@ static void _slurm_rpc_job_step_complete(slurm_msg_t * msg) lock_slurmctld(job_write_lock); /* do RPC call */ /* First set node down as needed on fatal error */ - if ((complete_job_step_msg->job_rc != SLURM_SUCCESS) && - (complete_job_step_msg->slurm_rc != SLURM_SUCCESS)) { - error ("Fatal error running job %u from node %s: %s", + if (complete_job_step_msg->slurm_rc != SLURM_SUCCESS) { + error ("Fatal slurmd error running job %u from node %s: %s", complete_job_step_msg->job_id, complete_job_step_msg->node_name, slurm_strerror (complete_job_step_msg->slurm_rc)); #ifdef HAVE_AUTHD if ((uid != 0) && (uid != getuid())) { error_code = ESLURM_USER_ID_MISSING; - error("Security violation, can't set node down uid %u", + error("Security violation, uid %u can't set node down", (unsigned int) uid); } #endif @@ -1009,13 +1011,15 @@ static void _slurm_rpc_job_step_complete(slurm_msg_t * msg) complete_job_step_msg->node_name; update_node_msg.node_state = NODE_STATE_DOWN; error_code = update_node ( &update_node_msg ); - /* FIXME: Release resources and requeue the job */ + if (complete_job_step_msg->job_rc != SLURM_SUCCESS) + job_requeue = true; } } /* Mark job and/or job step complete */ if (complete_job_step_msg->job_step_id == NO_VAL) { - error_code = job_complete(complete_job_step_msg->job_id, uid); + error_code = job_complete(complete_job_step_msg->job_id, + uid, job_requeue); unlock_slurmctld(job_write_lock); /* return result */ diff --git a/src/slurmctld/job_mgr.c b/src/slurmctld/job_mgr.c index c9f60aa8b901541c418ece3e869dd591150b9c47..b38ba02af4f092955d0b3778812655f360f645d3 100644 --- a/src/slurmctld/job_mgr.c +++ b/src/slurmctld/job_mgr.c @@ -28,7 +28,7 @@ \*****************************************************************************/ #ifdef HAVE_CONFIG_H -# include <config.h> +# include "config.h" #endif #include <ctype.h> @@ -41,22 +41,23 @@ #include <sys/stat.h> #include <fcntl.h> #include <unistd.h> + #ifdef HAVE_LIBELAN3 -#include <elan3/elan3.h> -#include <elan3/elanvp.h> -#include <src/common/qsw.h> -#define BUF_SIZE (1024 + QSW_PACK_SIZE) +# include <elan3/elan3.h> +# include <elan3/elanvp.h> +# define BUF_SIZE (1024 + QSW_PACK_SIZE) #else -#define BUF_SIZE 1024 +# define BUF_SIZE 1024 #endif -#include <src/common/list.h> -#include <src/common/macros.h> -#include <src/common/pack.h> -#include <src/common/slurm_errno.h> -#include <src/common/xstring.h> -#include <src/slurmctld/locks.h> -#include <src/slurmctld/slurmctld.h> +#include "src/common/list.h" +#include "src/common/macros.h" +#include "src/common/pack.h" +#include "src/common/slurm_errno.h" +#include "src/common/xstring.h" + +#include "src/slurmctld/locks.h" +#include "src/slurmctld/slurmctld.h" #include <src/common/credential_utils.h> slurm_ssl_key_ctx_t sign_ctx ; @@ -1085,14 +1086,15 @@ job_cancel (uint32_t job_id, uid_t uid) /* * job_complete - note the normal termination the specified job - * input: job_id - id of the job which completed - * uid - user id of user issuing the RPC - * output: returns 0 on success, otherwise ESLURM error code + * IN job_id - id of the job which completed + * IN uid - user id of user issuing the RPC + * IN requeue - job should be run again if possible + * RET - 0 on success, otherwise ESLURM error code * global: job_list - pointer global job list * last_job_update - time of last job table update */ int -job_complete (uint32_t job_id, uid_t uid) +job_complete (uint32_t job_id, uid_t uid, bool requeue) { struct job_record *job_ptr; @@ -1109,7 +1111,8 @@ job_complete (uint32_t job_id, uid_t uid) if ( (job_ptr->user_id != uid) && (uid != 0) && (uid != getuid ()) ) { - error ("Security violation, JOB_COMPLETE RPC from uid %d", uid); + error ("Security violation, JOB_COMPLETE RPC from uid %d", + uid); return ESLURM_USER_ID_MISSING; } @@ -1124,11 +1127,18 @@ job_complete (uint32_t job_id, uid_t uid) job_id, job_ptr->job_state); } + if (requeue && + job_ptr->details && + job_ptr->details->batch_flag) { + job_ptr->job_state = JOB_PENDING; + info ("Requeing job %u", job_ptr->job_id); + } else { + job_ptr->job_state = JOB_COMPLETE; + job_ptr->end_time = time(NULL); + delete_job_details(job_ptr); + delete_all_step_records(job_ptr); + } last_job_update = time (NULL); - job_ptr->job_state = JOB_COMPLETE; - job_ptr->end_time = time(NULL); - delete_job_details(job_ptr); - delete_all_step_records(job_ptr); return SLURM_SUCCESS; } diff --git a/src/slurmctld/slurmctld.h b/src/slurmctld/slurmctld.h index caf3ab37495c497fd05dc2774021b336603ac1fb..6df5abe9a602af72e12e32cff5e31a0fd27b53a8 100644 --- a/src/slurmctld/slurmctld.h +++ b/src/slurmctld/slurmctld.h @@ -45,17 +45,22 @@ #include <sys/types.h> #ifdef HAVE_LIBELAN3 -#include <src/common/qsw.h> -#endif +# include "src/common/qsw.h" +#endif /* HAVE_LIBELAN3 */ + +#ifdef WITH_PTHREADS +# include <pthread.h> +#endif /* WITH_PTHREADS */ + +#include "src/api/slurm.h" -#include <src/api/slurm.h> -#include <src/common/bitstring.h> -#include <src/common/list.h> -#include <src/common/log.h> -#include <src/common/macros.h> -#include <src/common/pack.h> -#include <src/common/slurm_protocol_api.h> -#include <src/common/xmalloc.h> +#include "src/common/bitstring.h" +#include "src/common/list.h" +#include "src/common/log.h" +#include "src/common/macros.h" +#include "src/common/pack.h" +#include "src/common/slurm_protocol_api.h" +#include "src/common/xmalloc.h" /* Perform full slurmctld's state every PERIODIC_CHECKPOINT seconds */ #define PERIODIC_CHECKPOINT 300 @@ -371,7 +376,7 @@ extern int job_cancel (uint32_t job_id, uid_t uid); extern int job_step_cancel (uint32_t job_id, uint32_t job_step_id, uid_t uid ); /* job_complete - note the completion the specified job */ -extern int job_complete (uint32_t job_id, uid_t uid); +extern int job_complete (uint32_t job_id, uid_t uid, bool requeue); /* job_step_complete - note the completion the specified job step*/ extern int job_step_complete (uint32_t job_id, uint32_t job_step_id, uid_t uid);