Summary of this patch (free text ahead of the first "diff --git" header):

  * doc/html/preempt.shtml: replace the SLURM 2.0/2.1/2.2 history paragraphs
    with a short version-neutral description and bump "Last modified".
  * src/common/slurm_protocol_defs.h: reword the SRUN_STEP_SIGNAL comment to
    cover the aprun, poe and runjob launch plugins.
  * src/slurmd/slurmstepd/req.c: fold the separate debug3() uid messages into
    each handler's opening debug() line (now "step=%u.%u uid=%d ...") and log
    an unauthorized signal-container request with error() instead of debug().
  * testsuite/expect/test3.7: expand the job-submission comment and add a
    second failure hint.

NOTE(review): leading whitespace (tabs assumed) and blank context lines were
reconstructed from a whitespace-collapsed copy of this patch; confirm with
"git apply --check" against the target tree before applying. The "index"
hashes were not recomputed after the corrections made to added lines.

diff --git a/doc/html/preempt.shtml b/doc/html/preempt.shtml
index 66a09469ee73ef846d9bd1431799c7db0843ae4d..31c812119b3e8304414f6eb45e161bc54589c9af 100644
--- a/doc/html/preempt.shtml
+++ b/doc/html/preempt.shtml
@@ -15,29 +15,10 @@ Alternately, the low priority job(s) can be requeued and started using other
 resources if so configured in newer versions of SLURM.
 </P>
 <P>
-In SLURM version 2.0 and earlier, high priority work is identified by the
-priority of the job's partition and low priority jobs are always suspended.
-The job preemption logic is within the <I>sched/gang</I> plugin.
-In SLURM version 2.1 and higher, the job's partition priority or its
-Quality Of Service (QOS) can be used to identify the which jobs can preempt
-or be preempted by other jobs.
-</P>
-<P>
-SLURM version 2.1 offers several options for the job preemption mechanism
-including checkpoint, requeue, or cancel.
-the option of requeuing low priority jobs
-Checkpointed jobs are not automatically requeued or restarted.
-Requeued jobs may restart faster by using different resources.
-All of these new job preemption mechanisms release a job's memory space for
-use by other jobs.
-In SLURM version 2.1, some job preemption logic was moved into the
-<I>select</I> plugin and main code base to permit use of both job preemption
-plus the backfill scheduler plugin, <i>sched/backfill</I>.
-</P>
-
-<P>
-SLURM version 2.2 offers the ability to configure the preemption mechanism
-used on a per partition or per QOS basis.
+The job's partition priority or its Quality Of Service (QOS) can be used to
+identify which jobs can preempt or be preempted by other jobs.
+SLURM offers the ability to configure the preemption mechanism used on a per
+partition or per QOS basis.
 For example, jobs in a low priority queue may get requeued,
 while jobs in a medium priority queue may get suspended.
 </P>
@@ -392,6 +373,6 @@ order to support ideal placements such as this, which can quickly complicate
 the design.
 Any and all help is welcome here!
 </P>
-<p style="text-align:center;">Last modified 10 December 2012</p>
+<p style="text-align:center;">Last modified 17 July 2013</p>
 
 <!--#include virtual="footer.txt"-->
diff --git a/src/common/slurm_protocol_defs.h b/src/common/slurm_protocol_defs.h
index 41d9d3b9fee7b1f06d89f9ffa3d12dab0cec4e0c..544ec49c515884af8652a6da1b2245653f9f3e37 100644
--- a/src/common/slurm_protocol_defs.h
+++ b/src/common/slurm_protocol_defs.h
@@ -322,7 +322,8 @@ typedef enum {
 	SRUN_EXEC,
 	SRUN_STEP_MISSING,
 	SRUN_REQUEST_SUSPEND,
-	SRUN_STEP_SIGNAL,	/* BluegeneQ: srun forwards signal to runjob */
+	SRUN_STEP_SIGNAL,	/* for launch plugins aprun, poe and runjob,
+				 * srun forwards signal to the launch command */
 
 	PMI_KVS_PUT_REQ = 7201,
 	PMI_KVS_PUT_RESP,
diff --git a/src/slurmd/slurmstepd/req.c b/src/slurmd/slurmstepd/req.c
index fa592410ee689f715d18f582a087bccdf8e0c33a..8c7d3512dfe2457c59d5b08f8fa61d83fc5066f4 100644
--- a/src/slurmd/slurmstepd/req.c
+++ b/src/slurmd/slurmstepd/req.c
@@ -591,13 +591,11 @@ _handle_signal_task_local(int fd, slurmd_job_t *job, uid_t uid)
 	int signal;
 	int ltaskid; /* local task index */
 
-	debug("_handle_signal_task_local for job %u.%u",
-	      job->jobid, job->stepid);
-
 	safe_read(fd, &signal, sizeof(int));
 	safe_read(fd, &ltaskid, sizeof(int));
+	debug("_handle_signal_task_local for step=%u.%u uid=%d signal=%d",
+	      job->jobid, job->stepid, (int) uid, signal);
 
-	debug3(" uid = %d", uid);
 	if (uid != job->uid && !_slurm_authorized_user(uid)) {
 		debug("kill req from uid %ld for job %u.%u owned by uid %ld",
 		      (long)uid, job->jobid, job->stepid, (long)job->uid);
@@ -669,14 +667,11 @@ _handle_signal_container(int fd, slurmd_job_t *job, uid_t uid)
 	char *ptr = NULL;
 	int target_node_id = 0;
 
-	debug("_handle_signal_container for job %u.%u",
-	      job->jobid, job->stepid);
-
 	safe_read(fd, &sig, sizeof(int));
-
-	debug3(" uid = %d", uid);
+	debug("_handle_signal_container for step=%u.%u uid=%d signal=%d",
+	      job->jobid, job->stepid, (int) uid, sig);
 	if ((uid != job->uid) && !_slurm_authorized_user(uid)) {
-		debug("signal container req from uid %ld for job %u.%u "
+		error("signal container req from uid %ld for step=%u.%u "
 		      "owned by uid %ld",
 		      (long)uid, job->jobid, job->stepid, (long)job->uid);
 		rc = -1;
@@ -906,11 +901,10 @@ _handle_terminate(int fd, slurmd_job_t *job, uid_t uid)
 	int rc = SLURM_SUCCESS;
 	int errnum = 0;
 
-	debug("_handle_terminate for job %u.%u",
-	      job->jobid, job->stepid);
+	debug("_handle_terminate for step=%u.%u uid=%d",
+	      job->jobid, job->stepid, (int) uid);
 	step_terminate_monitor_start(job->jobid, job->stepid);
 
-	debug3(" uid = %d", uid);
 	if (uid != job->uid && !_slurm_authorized_user(uid)) {
 		debug("terminate req from uid %ld for job %u.%u "
 		      "owned by uid %ld",
@@ -1080,8 +1074,8 @@ _handle_suspend(int fd, slurmd_job_t *job, uid_t uid)
 	int rc = SLURM_SUCCESS;
 	int errnum = 0;
 
-	debug("_handle_suspend for job %u.%u", job->jobid, job->stepid);
-	debug3(" uid = %d", uid);
+	debug("_handle_suspend for step=%u.%u uid=%d",
+	      job->jobid, job->stepid, (int) uid);
 	if (!_slurm_authorized_user(uid)) {
 		debug("job step suspend request from uid %ld for job %u.%u ",
 		      (long)uid, job->jobid, job->stepid);
diff --git a/testsuite/expect/test3.7 b/testsuite/expect/test3.7
index 1cd3dec1bab2c85f6b7f8326ba711c9fbc4787da..1018a4bd4dcb03910172c0ed5806348908ccb2db 100755
--- a/testsuite/expect/test3.7
+++ b/testsuite/expect/test3.7
@@ -126,7 +126,8 @@ exec $bin_cc -o $file_prog ${file_prog}.c
 exec $bin_chmod 700 $file_prog
 
 #
-# Submit two jobs to the same node
+# Submit two jobs to the same node.
+# The first job includes srun, second only the application
 #
 set srun_pid [spawn $sbatch -N1 -t2 --output=$file_out1 $file_prog_sh1]
 expect {
@@ -249,6 +250,7 @@ if {$exit_code == 0} {
 	exec $bin_rm -f $file_out1 $file_out2 $file_prog $file_prog_sh1 $file_prog_sh2
 	send_user "\nSUCCESS\n"
 } else {
-	send_user "\nFAILURE: May be due to use of gang scheduler, a race conditions, or the ProcTrack plugin not identifying the application as part of the job\n"
+	send_user "\nFAILURE: May be due to use of gang scheduler, a race conditions, or the ProcTrack plugin not identifying the application as part of the job.\n"
+	send_user "\nFAILURE: launch/poe and proctrack/pgid are incompatible.\n"
 }
 exit $exit_code