diff --git a/NEWS b/NEWS
index 6159eb7a6803cb93c10212456de552da3358b23e..ffc4093c42331e15f90c517417b2593945a629be 100644
--- a/NEWS
+++ b/NEWS
@@ -129,6 +129,8 @@ documents those changes that are of interest to users and administrators.
     was also given.
  -- Add note to slurm.conf man page about setting "--cpu_bind=no" as part of
     SallocDefaultCommand if a TaskPlugin is in use.
+ -- Set correct reason when a QOS' MaxTresMins is violated.
+ -- Ensure that a job is completely launched before trying to suspend it.
 
 * Changes in Slurm 15.08.8
 ==========================
diff --git a/src/slurmctld/acct_policy.c b/src/slurmctld/acct_policy.c
index 2911facfafa412fc9c8641b23d758df9b9ed8a81..dfb2320199680c435c26c028255cb9cee2f6391b 100644
--- a/src/slurmctld/acct_policy.c
+++ b/src/slurmctld/acct_policy.c
@@ -1222,7 +1222,8 @@ static int _qos_policy_validate(job_desc_msg_t *job_desc,
 				   strict_checking)) {
 			if (reason)
 				*reason = _get_tres_state_reason(
-					tres_pos, WAIT_QOS_MAX_UNK_PER_JOB);
+					tres_pos,
+					WAIT_QOS_MAX_UNK_MINS_PER_JOB);
 			debug2("job submit for user %s(%u): "
 			       "tres(%s) time limit request %"PRIu64" "
 			       "exceeds max per-job limit %"PRIu64" "
diff --git a/src/slurmctld/gang.c b/src/slurmctld/gang.c
index 29fe511c96c91de0a2540ac806d531a2c778c0b9..d043f1dd708dd9ca1fefd252990b533bca6fefad 100644
--- a/src/slurmctld/gang.c
+++ b/src/slurmctld/gang.c
@@ -579,11 +579,12 @@ static int _suspend_job(uint32_t job_id)
 	/* job_suspend() returns ESLURM_DISABLED if job is already suspended */
 	if (rc == SLURM_SUCCESS) {
 		if (slurmctld_conf.debug_flags & DEBUG_FLAG_GANG)
-			info("gang: suspending %u", job_id);
+			info("gang: suspending JobID=%u", job_id);
 		else
-			debug("gang: suspending %u", job_id);
+			debug("gang: suspending JobID=%u", job_id);
 	} else if (rc != ESLURM_DISABLED) {
-		info("gang: suspending job %u: %s", job_id, slurm_strerror(rc));
+		info("gang: suspending JobID=%u: %s",
+		     job_id, slurm_strerror(rc));
 	}
 	return rc;
 }
@@ -599,11 +600,12 @@ static void _resume_job(uint32_t job_id)
 	rc = job_suspend(&msg, 0, -1, false, (uint16_t)NO_VAL);
 	if (rc == SLURM_SUCCESS) {
 		if (slurmctld_conf.debug_flags & DEBUG_FLAG_GANG)
-			info("gang: resuming %u", job_id);
+			info("gang: resuming JobID=%u", job_id);
 		else
-			debug("gang: resuming %u", job_id);
+			debug("gang: resuming JobID=%u", job_id);
 	} else if (rc != ESLURM_ALREADY_DONE) {
-		error("gang: resuming job %u: %s", job_id, slurm_strerror(rc));
+		error("gang: resuming JobID=%u: %s",
+		      job_id, slurm_strerror(rc));
 	}
 }
diff --git a/src/slurmd/slurmd/req.c b/src/slurmd/slurmd/req.c
index 8f955517ef03b13fab9e95a96a1012b7f71baa35..47775dd734b38084914642cc3f63da7cf18cec07 100644
--- a/src/slurmd/slurmd/req.c
+++ b/src/slurmd/slurmd/req.c
@@ -6029,7 +6029,7 @@ static void _launch_complete_wait(uint32_t job_id)
 		}
 		if (j < JOB_STATE_CNT)	/* Found job, ready to return */
 			break;
-		if (difftime(time(NULL), start) <= 3) {	/* Retry for 3 secs */
+		if (difftime(time(NULL), start) <= 9) {	/* Retry for 9 secs */
 			debug2("wait for launch of job %u before suspending it",
 			       job_id);
 			gettimeofday(&now, NULL);
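Background for the _launch_complete_wait() hunk above: slurmd should not suspend a job it has not finished launching, so this function re-checks its job-state table until the job appears, and the surrounding context (the gettimeofday() call feeding a one-second deadline) indicates each retry sleeps on a condition variable rather than busy-waiting. The change simply widens the retry window from 3 to 9 seconds. A minimal standalone sketch of that bounded-wait shape, with hypothetical names (launched, state_mutex, state_cond, wait_for_launch are illustrative, not Slurm symbols):

#include <pthread.h>
#include <stdbool.h>
#include <time.h>

static pthread_mutex_t state_mutex = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t  state_cond  = PTHREAD_COND_INITIALIZER;
static bool launched;	/* set true (under state_mutex) by the launch path */

/* Wait up to max_secs for launched to become true; false on timeout. */
static bool wait_for_launch(int max_secs)
{
	time_t start = time(NULL);
	bool ok;

	pthread_mutex_lock(&state_mutex);
	while (!launched && (difftime(time(NULL), start) <= max_secs)) {
		struct timespec deadline;

		clock_gettime(CLOCK_REALTIME, &deadline);
		deadline.tv_sec += 1;	/* wake to re-check once a second */
		pthread_cond_timedwait(&state_cond, &state_mutex, &deadline);
	}
	ok = launched;
	pthread_mutex_unlock(&state_mutex);
	return ok;
}

The one-second pthread_cond_timedwait() deadline keeps the loop responsive to both the signal from the launch path and the overall timeout, at the cost of a few extra wakeups in the worst case.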
diff --git a/src/slurmd/slurmstepd/mgr.c b/src/slurmd/slurmstepd/mgr.c
index 327cf0dd95c9b94d3857d3becbb81501086e9f73..51cd5aaf44f9bda55280dfc60be6fdfae119634e 100644
--- a/src/slurmd/slurmstepd/mgr.c
+++ b/src/slurmd/slurmstepd/mgr.c
@@ -58,18 +58,19 @@
 #  include "src/common/unsetenv.h"
 #endif
 
-#include <sys/wait.h>
-#include <sys/stat.h>
-#include <sys/param.h>
+#include <grp.h>
 #include <poll.h>
-#include <unistd.h>
+#include <pthread.h>
 #include <pwd.h>
-#include <grp.h>
 #include <stdio.h>
 #include <string.h>
-#include <sys/utsname.h>
+#include <sys/param.h>
+#include <sys/stat.h>
 #include <sys/types.h>
+#include <sys/utsname.h>
+#include <sys/wait.h>
 #include <time.h>
+#include <unistd.h>
 
 #if HAVE_STDLIB_H
 #  include <stdlib.h>
@@ -196,6 +197,7 @@ static int _send_exit_msg(stepd_step_rec_t *job, uint32_t *tid, int n,
 static void _wait_for_children_slurmstepd(stepd_step_rec_t *job);
 static int  _send_pending_exit_msgs(stepd_step_rec_t *job);
 static void _send_step_complete_msgs(stepd_step_rec_t *job);
+static void _set_job_state(stepd_step_rec_t *job, slurmstepd_state_t new_state);
 static void _wait_for_all_tasks(stepd_step_rec_t *job);
 static int  _wait_for_any_task(stepd_step_rec_t *job, bool waitflag);
 
@@ -973,6 +975,14 @@ extern void agent_queue_request(void *dummy)
 		       "checkpoint plugin");
 }
 
+static void _set_job_state(stepd_step_rec_t *job, slurmstepd_state_t new_state)
+{
+	slurm_mutex_lock(&job->state_mutex);
+	job->state = new_state;
+	pthread_cond_signal(&job->state_cond);
+	slurm_mutex_unlock(&job->state_mutex);
+}
+
 static int _spawn_job_container(stepd_step_rec_t *job)
 {
 	jobacctinfo_t *jobacct = NULL;
@@ -1014,7 +1024,7 @@ static int _spawn_job_container(stepd_step_rec_t *job)
 		jobacct_gather_add_task(pid, &jobacct_id, 1);
 
 	container_g_add_cont(job->jobid, job->cont_id);
-	job->state = SLURMSTEPD_STEP_RUNNING;
+	_set_job_state(job, SLURMSTEPD_STEP_RUNNING);
 
 	if (!conf->job_acct_gather_freq)
 		jobacct_gather_stat_task(0);
@@ -1221,7 +1231,7 @@ job_manager(stepd_step_rec_t *job)
 	xsignal_block (mgr_sigarray);
 	reattach_job = job;
 
-	job->state = SLURMSTEPD_STEP_RUNNING;
+	_set_job_state(job, SLURMSTEPD_STEP_RUNNING);
 
 	/* Attach slurmstepd to system cgroups, if configured */
 	attach_system_cgroup_pid(getpid());
@@ -1240,7 +1250,7 @@ job_manager(stepd_step_rec_t *job)
 
 	acct_gather_profile_g_node_step_end();
 	acct_gather_profile_fini();
 
-	job->state = SLURMSTEPD_STEP_ENDING;
+	_set_job_state(job, SLURMSTEPD_STEP_ENDING);
 	if (!job->batch &&
 	    (switch_g_job_fini(job->switch_job) < 0)) {
@@ -1259,6 +1269,7 @@ fail2:
 	 * terminated before the switch window can be released by
 	 * switch_g_job_postfini().
 	 */
+	_set_job_state(job, SLURMSTEPD_STEP_ENDING);
 	step_terminate_monitor_start(job->jobid, job->stepid);
 	if (job->cont_id != 0) {
 		proctrack_g_signal(job->cont_id, SIGKILL);
@@ -1311,6 +1322,7 @@ fail1:
 	/* If interactive job startup was abnormal,
 	 * be sure to notify client.
 	 */
+	_set_job_state(job, SLURMSTEPD_STEP_ENDING);
 	if (rc != 0) {
 		error("job_manager exiting abnormally, rc = %d", rc);
 		_send_launch_resp(job, rc);
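A note on the _set_job_state() helper added above: the state write happens while state_mutex is held, and that is what makes the handshake reliable. If job->state were updated without the lock, a waiter could test the state, decide to sleep, and miss the wakeup that fires between its test and its pthread_cond_wait(). The helper uses pthread_cond_signal(), which wakes a single waiter and matches the one waiter this patch introduces; if several threads could ever block on state_cond, pthread_cond_broadcast() would be the defensive variant. A sketch of that alternative, using stand-in types rather than the real stepd_step_rec_t:

#include <pthread.h>

typedef enum { STEP_STARTING, STEP_RUNNING, STEP_ENDING } step_state_t;

struct step_rec {
	step_state_t	state;
	pthread_cond_t	state_cond;
	pthread_mutex_t	state_mutex;
};

/* Broadcast variant of the patch's helper: wakes every waiter, not one. */
static void set_state_broadcast(struct step_rec *job, step_state_t new_state)
{
	pthread_mutex_lock(&job->state_mutex);
	job->state = new_state;
	pthread_cond_broadcast(&job->state_cond);
	pthread_mutex_unlock(&job->state_mutex);
}

Extra wakeups from a broadcast are harmless as long as every waiter re-tests its predicate after pthread_cond_wait() returns, which the waiter added in req.c below does.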
diff --git a/src/slurmd/slurmstepd/req.c b/src/slurmd/slurmstepd/req.c
index e9459ca58b69083b3f86e127a19ad39dc881e8e9..0238ad0c3442dce2a9b164b314cffd095b921bd5 100644
--- a/src/slurmd/slurmstepd/req.c
+++ b/src/slurmd/slurmstepd/req.c
@@ -1372,6 +1372,19 @@ rwfail:
 	return SLURM_FAILURE;
 }
 
+/* Wait for the job to completely start before trying to suspend it. */
+static void _wait_for_job_init(stepd_step_rec_t *job)
+{
+	slurm_mutex_lock(&job->state_mutex);
+	while (1) {
+		if (job->state != SLURMSTEPD_STEP_STARTING) {
+			slurm_mutex_unlock(&job->state_mutex);
+			break;
+		}
+		pthread_cond_wait(&job->state_cond, &job->state_mutex);
+	}
+}
+
 static int
 _handle_suspend(int fd, stepd_step_rec_t *job, uid_t uid)
 {
@@ -1393,6 +1406,8 @@ _handle_suspend(int fd, stepd_step_rec_t *job, uid_t uid)
 		goto done;
 	}
 
+	_wait_for_job_init(job);
+
 	if (job->cont_id == 0) {
 		debug ("step %u.%u invalid container [cont_id:%"PRIu64"]",
 		       job->jobid, job->stepid, job->cont_id);
diff --git a/src/slurmd/slurmstepd/slurmstepd_job.c b/src/slurmd/slurmstepd/slurmstepd_job.c
index 34988f9372e96d0acdc0e0b50b9ec6d8a9c0d338..c89ae0a99dec2d5b6bfc4fc22407eb8460f73faa 100644
--- a/src/slurmd/slurmstepd/slurmstepd_job.c
+++ b/src/slurmd/slurmstepd/slurmstepd_job.c
@@ -268,7 +268,9 @@ stepd_step_rec_create(launch_tasks_request_msg_t *msg, uint16_t protocol_version
 		return NULL;
 	}
 
-	job->state = SLURMSTEPD_STEP_STARTING;
+	job->state = SLURMSTEPD_STEP_STARTING;
+	pthread_cond_init(&job->state_cond, NULL);
+	pthread_mutex_init(&job->state_mutex, NULL);
 	job->node_tasks = msg->tasks_to_launch[nodeid];
 	i = sizeof(uint16_t) * msg->nnodes;
 	job->task_cnts = xmalloc(i);
@@ -443,7 +445,9 @@ batch_stepd_step_rec_create(batch_job_launch_msg_t *msg)
 
 	job = xmalloc(sizeof(stepd_step_rec_t));
 
-	job->state = SLURMSTEPD_STEP_STARTING;
+	job->state = SLURMSTEPD_STEP_STARTING;
+	pthread_cond_init(&job->state_cond, NULL);
+	pthread_mutex_init(&job->state_mutex, NULL);
 	if (msg->cpus_per_node)
 		job->cpus = msg->cpus_per_node[0];
 	job->node_tasks = 1;
diff --git a/src/slurmd/slurmstepd/slurmstepd_job.h b/src/slurmd/slurmstepd/slurmstepd_job.h
index 824a9fa82d8f8f9cfa1182de9067806f03aa5b35..25eefb94776e5ce15491fca84ef51f0fbbcb52a5 100644
--- a/src/slurmd/slurmstepd/slurmstepd_job.h
+++ b/src/slurmd/slurmstepd/slurmstepd_job.h
@@ -126,7 +126,9 @@ typedef struct {
 	/* MPMD specifications, needed for Cray */
 } mpmd_set_t;
 typedef struct {
-	slurmstepd_state_t state;
+	slurmstepd_state_t state;	/* Job state */
+	pthread_cond_t state_cond;	/* Job state conditional */
+	pthread_mutex_t state_mutex;	/* Job state mutex */
 	uint32_t jobid;			/* Current SLURM job id      */
 	uint32_t stepid;		/* Current step id (or NO_VAL) */
 	uint32_t array_job_id;		/* job array master job ID */
diff --git a/testsuite/expect/inc21.21_tests b/testsuite/expect/inc21.21_tests
index 0abc7aee74dc94f40c83336068a2c393d2502fda..26465408dcbc1cfb145acd05f2acf3ab861e2868 100644
--- a/testsuite/expect/inc21.21_tests
+++ b/testsuite/expect/inc21.21_tests
@@ -154,6 +154,12 @@ proc inc21_21_grp_test { test_type limit } {
 	send_user "\n===== Test $test_type "
 	send_user "(Within: inc21.21_tests function: inc21_21_grp_test) =====\n"
 
+	if { ![string compare $test_type "grpcpumins"] &&
+	     ![test_enforce_safe_set] } {
+		send_user "\nWARNING: This test can't be run unless AccountingStorageEnforce includes \"safe\"\n"
+		return $exit_code
+	}
+
 	# Check and see if it is a cpu test
 	if { [string compare $test_type "grpcpus"] == 0 ||
 	     [string compare $test_type "grpcpumins"] == 0 ||
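Taken together, these hunks form a small startup/suspend handshake: stepd_step_rec_create() and batch_stepd_step_rec_create() initialize state_cond and state_mutex, job_manager() announces transitions through _set_job_state(), and _handle_suspend() blocks in _wait_for_job_init() until the step has left SLURMSTEPD_STEP_STARTING. A self-contained sketch of the whole pattern (hypothetical names, not Slurm code; build with "cc demo.c -lpthread"):

#include <pthread.h>
#include <stdio.h>
#include <unistd.h>

typedef enum { STEP_STARTING, STEP_RUNNING, STEP_ENDING } step_state_t;

struct step_rec {
	step_state_t	state;
	pthread_cond_t	state_cond;
	pthread_mutex_t	state_mutex;
};

/* Mirrors _set_job_state(): update the state under the mutex, then signal. */
static void set_state(struct step_rec *job, step_state_t new_state)
{
	pthread_mutex_lock(&job->state_mutex);
	job->state = new_state;
	pthread_cond_signal(&job->state_cond);
	pthread_mutex_unlock(&job->state_mutex);
}

/* Mirrors _wait_for_job_init(): block until the step leaves STARTING.
 * The while re-test guards against spurious wakeups. */
static void wait_for_init(struct step_rec *job)
{
	pthread_mutex_lock(&job->state_mutex);
	while (job->state == STEP_STARTING)
		pthread_cond_wait(&job->state_cond, &job->state_mutex);
	pthread_mutex_unlock(&job->state_mutex);
}

static void *launcher(void *arg)
{
	struct step_rec *job = arg;

	sleep(1);			/* stand-in for launch work */
	set_state(job, STEP_RUNNING);	/* announce the transition */
	return NULL;
}

int main(void)
{
	struct step_rec job = { .state = STEP_STARTING };
	pthread_t tid;

	pthread_cond_init(&job.state_cond, NULL);
	pthread_mutex_init(&job.state_mutex, NULL);
	pthread_create(&tid, NULL, launcher, &job);
	wait_for_init(&job);	/* a suspend request would block here */
	printf("step launched; safe to suspend\n");
	pthread_join(tid, NULL);
	return 0;
}

The patch's _wait_for_job_init() unlocks inside its loop and breaks out rather than unlocking after the loop; the two forms behave identically.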