Commit 342e5161 authored by Moe Jette
parent ced0cdca
Showing 617 additions and 97 deletions
......@@ -68,6 +68,10 @@ documents those changes that are of interest to users and admins.
are created in SLURM tables for future use without a reboot of the SLURM
daemons, but are not reported by any SLURM commands or APIs.
* Changes in SLURM 1.3.10
=========================
* Changes in SLURM 1.3.9
========================
-- Fix jobs being cancelled by ctrl-C to have correct cancelled state in
......
......@@ -93,6 +93,7 @@ AC_DEFUN([X_AC_DATABASES],
AC_MSG_RESULT([MySQL (non-threaded) test program built properly.])
AC_SUBST(MYSQL_LIBS)
AC_SUBST(MYSQL_CFLAGS)
AC_DEFINE(MYSQL_NOT_THREAD_SAFE, 1, [Define to 1 if with non thread-safe code])
AC_DEFINE(HAVE_MYSQL, 1, [Define to 1 if using MySQL libraries])
else
MYSQL_CFLAGS=""
......
......@@ -273,6 +273,9 @@
/* Enable multiple slurmd on one node */
#undef MULTIPLE_SLURMD
/* Define to 1 if with non thread-safe code */
#undef MYSQL_NOT_THREAD_SAFE
/* Define to 1 if you are building a production release. */
#undef NDEBUG
......
......@@ -25633,6 +25633,11 @@ echo "${ECHO_T}MySQL (non-threaded) test program built properly." >&6; }
 
 
 
cat >>confdefs.h <<\_ACEOF
#define MYSQL_NOT_THREAD_SAFE 1
_ACEOF
cat >>confdefs.h <<\_ACEOF
#define HAVE_MYSQL 1
_ACEOF
......
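A configure-time define such as MYSQL_NOT_THREAD_SAFE is typically consumed by guarding library calls with a lock. The following is an illustrative sketch only, not code from the SLURM sources; the mutex and wrapper names are hypothetical.

#include <pthread.h>
#include <mysql.h>

#ifdef MYSQL_NOT_THREAD_SAFE
/* Serialize MySQL calls when libmysqlclient is not thread-safe. */
static pthread_mutex_t mysql_lock = PTHREAD_MUTEX_INITIALIZER;
#endif

static int guarded_query(MYSQL *db, const char *query)
{
	int rc;
#ifdef MYSQL_NOT_THREAD_SAFE
	pthread_mutex_lock(&mysql_lock);
#endif
	rc = mysql_query(db, query);	/* any call into libmysqlclient */
#ifdef MYSQL_NOT_THREAD_SAFE
	pthread_mutex_unlock(&mysql_lock);
#endif
	return rc;
}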
......@@ -60,12 +60,13 @@ Networking, Italy)</li>
<li>Federico Sacerdoti (D.E. Shaw)</li>
<li>Jeff Squyres (LAM MPI)</li>
<li>Prashanth Tamraparni (HP, India)</li>
<li>Adam Todorski (Rensselaer Polytechnic Institute)</li
<li>Adam Todorski (Rensselaer Polytechnic Institute)</li>
<li>Kevin Tew (LLNL/Brigham Young University)</li>
<li>Tim Wickberg (Rensselaer Polytechnic Institute)</li>
<li>Jay Windley (Linux NetworX)</li>
<li>Anne-Marie Wunderlin (Bull)</li>
</ul>
<p style="text-align:center;">Last modified 5 September 2008</p>
<p style="text-align:center;">Last modified 10 October 2008</p>
<!--#include virtual="footer.txt"-->
......@@ -24,7 +24,11 @@ man3_MANS = man3/slurm_hostlist_create.3 \
man3/slurm_hostlist_destroy.3 \
man3/slurm_hostlist_shift.3 \
man3/slurm_allocate_resources.3 \
man3/slurm_allocate_resources_blocking.3 \
man3/slurm_allocation_lookup.3 \
man3/slurm_allocation_lookup_lite.3 \
man3/slurm_allocation_msg_thr_create.3 \
man3/slurm_allocation_msg_thr_destroy.3 \
man3/slurm_api_version.3 \
man3/slurm_checkpoint_able.3 \
man3/slurm_checkpoint_complete.3 \
......@@ -34,27 +38,38 @@ man3_MANS = man3/slurm_hostlist_create.3 \
man3/slurm_checkpoint_error.3 \
man3/slurm_checkpoint_failed.3 \
man3/slurm_checkpoint_restart.3 \
man3/slurm_checkpoint_task_complete.3 \
man3/slurm_checkpoint_vacate.3 \
man3/slurm_clear_trigger.3 \
man3/slurm_complete_job.3 \
man3/slurm_complete_job_step.3 \
man3/slurm_confirm_allocation.3 \
man3/slurm_free_ctl_conf.3 \
man3/slurm_free_job_info_msg.3 \
man3/slurm_free_job_alloc_info_response_msg.3 \
man3/slurm_free_job_step_create_response_msg.3 \
man3/slurm_free_job_step_info_response_msg.3 \
man3/slurm_free_node_info.3 \
man3/slurm_free_node_info_msg.3 \
man3/slurm_free_partition_info.3 \
man3/slurm_free_partition_info_msg.3 \
man3/slurm_free_resource_allocation_response_msg.3 \
man3/slurm_free_slurmd_status.3 \
man3/slurm_free_submit_response_response_msg.3 \
man3/slurm_free_trigger_msg.3 \
man3/slurm_get_checkpoint_file_path.3 \
man3/slurm_get_end_time.3 \
man3/slurm_get_errno.3 \
man3/slurm_get_job_steps.3 \
man3/slurm_get_rem_time.3 \
man3/slurm_get_select_jobinfo.3 \
man3/slurm_get_triggers.3 \
man3/slurm_init_job_desc_msg.3 \
man3/slurm_init_part_desc_msg.3 \
man3/slurm_job_step_create.3 \
man3/slurm_job_step_launch_t_init.3 \
man3/slurm_job_step_layout_get.3 \
man3/slurm_job_step_layout_free.3 \
man3/slurm_job_will_run.3 \
man3/slurm_jobinfo_ctx_get.3 \
man3/slurm_kill_job.3 \
......@@ -64,8 +79,11 @@ man3_MANS = man3/slurm_hostlist_create.3 \
man3/slurm_load_jobs.3 \
man3/slurm_load_node.3 \
man3/slurm_load_partitions.3 \
man3/slurm_load_slurmd_status.3 \
man3/slurm_notify_job.3 \
man3/slurm_perror.3 \
man3/slurm_pid2jobid.3 \
man3/slurm_ping.3 \
man3/slurm_print_ctl_conf.3 \
man3/slurm_print_job_info.3 \
man3/slurm_print_job_info_msg.3 \
......@@ -75,20 +93,38 @@ man3_MANS = man3/slurm_hostlist_create.3 \
man3/slurm_print_node_table.3 \
man3/slurm_print_partition_info.3 \
man3/slurm_print_partition_info_msg.3 \
man3/slurm_print_slurmd_status.3 \
man3/slurm_read_hostfile.3 \
man3/slurm_reconfigure.3 \
man3/slurm_resume.3 \
man3/slurm_requeue.3 \
man3/slurm_set_debug_level.3 \
man3/slurm_set_trigger.3 \
man3/slurm_shutdown.3 \
man3/slurm_signal_job.3 \
man3/slurm_signal_job_step.3 \
man3/slurm_slurmd_status.3 \
man3/slurm_sprint_job_info.3 \
man3/slurm_sprint_job_step_info.3 \
man3/slurm_sprint_node_table.3 \
man3/slurm_sprint_partition_info.3 \
man3/slurm_step_ctx_create.3 \
man3/slurm_step_ctx_create_no_alloc.3 \
man3/slurm_step_ctx_daemon_per_node_hack.3 \
man3/slurm_step_ctx_destroy.3 \
man3/slurm_step_ctx_params_t_init.3 \
man3/slurm_step_ctx_get.3 \
man3/slurm_step_launch.3 \
man3/slurm_step_launch_fwd_signal.3 \
man3/slurm_step_launch_abort.3 \
man3/slurm_step_launch_wait_finish.3 \
man3/slurm_step_launch_wait_start.3 \
man3/slurm_strerror.3 \
man3/slurm_submit_batch_job.3 \
man3/slurm_suspend.3 \
man3/slurm_terminate_job.3 \
man3/slurm_terminate_job_step.3 \
man3/slurm_trigger.3 \
man3/slurm_update_job.3 \
man3/slurm_update_node.3 \
man3/slurm_update_partition.3
......
......@@ -265,7 +265,11 @@ man3_MANS = man3/slurm_hostlist_create.3 \
man3/slurm_hostlist_destroy.3 \
man3/slurm_hostlist_shift.3 \
man3/slurm_allocate_resources.3 \
man3/slurm_allocate_resources_blocking.3 \
man3/slurm_allocation_lookup.3 \
man3/slurm_allocation_lookup_lite.3 \
man3/slurm_allocation_msg_thr_create.3 \
man3/slurm_allocation_msg_thr_destroy.3 \
man3/slurm_api_version.3 \
man3/slurm_checkpoint_able.3 \
man3/slurm_checkpoint_complete.3 \
......@@ -275,27 +279,38 @@ man3_MANS = man3/slurm_hostlist_create.3 \
man3/slurm_checkpoint_error.3 \
man3/slurm_checkpoint_failed.3 \
man3/slurm_checkpoint_restart.3 \
man3/slurm_checkpoint_task_complete.3 \
man3/slurm_checkpoint_vacate.3 \
man3/slurm_clear_trigger.3 \
man3/slurm_complete_job.3 \
man3/slurm_complete_job_step.3 \
man3/slurm_confirm_allocation.3 \
man3/slurm_free_ctl_conf.3 \
man3/slurm_free_job_info_msg.3 \
man3/slurm_free_job_alloc_info_response_msg.3 \
man3/slurm_free_job_step_create_response_msg.3 \
man3/slurm_free_job_step_info_response_msg.3 \
man3/slurm_free_node_info.3 \
man3/slurm_free_node_info_msg.3 \
man3/slurm_free_partition_info.3 \
man3/slurm_free_partition_info_msg.3 \
man3/slurm_free_resource_allocation_response_msg.3 \
man3/slurm_free_slurmd_status.3 \
man3/slurm_free_submit_response_response_msg.3 \
man3/slurm_free_trigger_msg.3 \
man3/slurm_get_checkpoint_file_path.3 \
man3/slurm_get_end_time.3 \
man3/slurm_get_errno.3 \
man3/slurm_get_job_steps.3 \
man3/slurm_get_rem_time.3 \
man3/slurm_get_select_jobinfo.3 \
man3/slurm_get_triggers.3 \
man3/slurm_init_job_desc_msg.3 \
man3/slurm_init_part_desc_msg.3 \
man3/slurm_job_step_create.3 \
man3/slurm_job_step_launch_t_init.3 \
man3/slurm_job_step_layout_get.3 \
man3/slurm_job_step_layout_free.3 \
man3/slurm_job_will_run.3 \
man3/slurm_jobinfo_ctx_get.3 \
man3/slurm_kill_job.3 \
......@@ -305,8 +320,11 @@ man3_MANS = man3/slurm_hostlist_create.3 \
man3/slurm_load_jobs.3 \
man3/slurm_load_node.3 \
man3/slurm_load_partitions.3 \
man3/slurm_load_slurmd_status.3 \
man3/slurm_notify_job.3 \
man3/slurm_perror.3 \
man3/slurm_pid2jobid.3 \
man3/slurm_ping.3 \
man3/slurm_print_ctl_conf.3 \
man3/slurm_print_job_info.3 \
man3/slurm_print_job_info_msg.3 \
......@@ -316,20 +334,38 @@ man3_MANS = man3/slurm_hostlist_create.3 \
man3/slurm_print_node_table.3 \
man3/slurm_print_partition_info.3 \
man3/slurm_print_partition_info_msg.3 \
man3/slurm_print_slurmd_status.3 \
man3/slurm_read_hostfile.3 \
man3/slurm_reconfigure.3 \
man3/slurm_resume.3 \
man3/slurm_requeue.3 \
man3/slurm_set_debug_level.3 \
man3/slurm_set_trigger.3 \
man3/slurm_shutdown.3 \
man3/slurm_signal_job.3 \
man3/slurm_signal_job_step.3 \
man3/slurm_slurmd_status.3 \
man3/slurm_sprint_job_info.3 \
man3/slurm_sprint_job_step_info.3 \
man3/slurm_sprint_node_table.3 \
man3/slurm_sprint_partition_info.3 \
man3/slurm_step_ctx_create.3 \
man3/slurm_step_ctx_create_no_alloc.3 \
man3/slurm_step_ctx_daemon_per_node_hack.3 \
man3/slurm_step_ctx_destroy.3 \
man3/slurm_step_ctx_params_t_init.3 \
man3/slurm_step_ctx_get.3 \
man3/slurm_step_launch.3 \
man3/slurm_step_launch_fwd_signal.3 \
man3/slurm_step_launch_abort.3 \
man3/slurm_step_launch_wait_finish.3 \
man3/slurm_step_launch_wait_start.3 \
man3/slurm_strerror.3 \
man3/slurm_submit_batch_job.3 \
man3/slurm_suspend.3 \
man3/slurm_terminate_job.3 \
man3/slurm_terminate_job_step.3 \
man3/slurm_trigger.3 \
man3/slurm_update_job.3 \
man3/slurm_update_node.3 \
man3/slurm_update_partition.3
......
......@@ -218,13 +218,19 @@ To clear a previously set value use the modify command with a new value of \-1.
\fIGrpCPUMins\fP=<max cpu minutes>
Maximum number of CPU minutes running jobs are able to be allocated in aggregate
for this association and all associations which are children of this association.
To clear a previously set value use the modify command with a new value of \-1.
To clear a previously set value use the modify command with a new
value of \-1. (NOTE: This limit is not currently enforced in SLURM.
You can still set it, but it will not be enforced until a future
version of SLURM.)
.TP
\fIGrpCPUs\fP=<max cpus>
Maximum number of CPUs running jobs are able to be allocated in aggregate for
this association and all associations which are children of this association.
To clear a previously set value use the modify command with a new value of \-1.
To clear a previously set value use the modify command with a new
value of \-1. (NOTE: This limit is not currently enforced in SLURM.
You can still set it, but it will not be enforced until a future
version of SLURM.)
.TP
\fIGrpJobs\fP=<max jobs>
......@@ -256,14 +262,20 @@ To clear a previously set value use the modify command with a new value of \-1.
Maximum number of CPU minutes each job is able to use in this account.
This is overridden if set directly on a user.
Default is the cluster's limit.
To clear a previously set value use the modify command with a new value of \-1.
To clear a previously set value use the modify command with a new
value of \-1. (NOTE: This limit is not currently enforced in SLURM.
You can still set it, but it will not be enforced until a future
version of SLURM.)
.TP
\fIMaxCPUs\fP=<max cpus>
Maximum number of CPUs each job is able to use in this account.
This is overridden if set directly on a user.
Default is the cluster's limit.
To clear a previously set value use the modify command with a new value of \-1.
To clear a previously set value use the modify command with a new
value of \-1. (NOTE: This limit is not currently enforced in SLURM.
You can still set it, but it will not be enforced until a future
version of SLURM.)
.TP
\fIMaxJobs\fP=<max jobs>
......@@ -315,11 +327,15 @@ Name of SLURM partition these limits apply to.
.TP
\fIQosLevel\fP<operator><comma separated list of qos names>
Specify Quality of Service that jobs are to run at for this account.
Now consisting of Normal, Standby, Expedite, and Exempt.
This is overridden if set directly on a user.
Setting an account's QosLevel to '' (two single quotes with nothing
between them) restores it's default setting.
(For use with MOAB only.)
Specify the default Quality of Service (QOS) values that jobs are able
to run at for this account. To get a list of valid QOS names use
'sacctmgr list qos'. This value will override its parent's value and
push down to its children as the new default. Setting a QosLevel to ''
(two single quotes with nothing between them) restores its default
setting. You can also use the operators += and \-= to add or remove
certain QOS names from a QOS list.
Valid <operator> values include:
.RS
.TP 5
......@@ -346,13 +362,19 @@ To clear a previously set value use the modify command with a new value of \-1.
\fIGrpCPUMins\fP=<max cpu minutes>
Maximum number of CPU minutes running jobs are able to be allocated in aggregate
for this association and all associations which are children of this association.
To clear a previously set value use the modify command with a new value of \-1.
To clear a previously set value use the modify command with a new
value of \-1. (NOTE: This limit is not currently enforced in SLURM.
You can still set it, but it will not be enforced until a future
version of SLURM.)
.TP
\fIGrpCPUs\fP=<max cpus>
Maximum number of CPUs running jobs are able to be allocated in aggregate for
this association and all associations which are children of this association.
To clear a previously set value use the modify command with a new value of \-1.
To clear a previously set value use the modify command with a new
value of \-1. (NOTE: This limit is not currently enforced in SLURM.
You can still set it, but it will not be enforced until a future
version of SLURM.)
.TP
\fIGrpJobs\fP=<max jobs>
......@@ -384,7 +406,20 @@ To clear a previously set value use the modify command with a new value of \-1.
Maximum number of CPU minutes each job is able to use in this account.
This is overridden if set directly on an account or user.
Default is no limit.
To clear a previously set value use the modify command with a new value of \-1.
To clear a previously set value use the modify command with a new
value of \-1. (NOTE: This limit is not currently enforced in SLURM.
You can still set it, but it will not be enforced until a future
version of SLURM.)
.TP
\fIMaxCPUs\fP=<max cpus>
Maximum number of CPUs each job is able to use in this account.
This is overridden if set directly on an account or user.
Default is no limit.
To clear a previously set value use the modify command with a new
value of \-1. (NOTE: This limit is not currently enforced in SLURM.
You can still set it, but it will not be enforced until a future
version of SLURM.)
.TP
\fIMaxJobs\fP=<max jobs>
......@@ -425,11 +460,13 @@ configuration file for some Slurm\-managed cluster.
.TP
\fIQosLevel\fP<operator><comma separated list of qos names>
Specify Quality of Service that jobs are to run at for this account.
Now consisting of Normal, Standby, Expedite, and Exempt.
This is overridden if set directly on an account user.
Setting an account's QosLevel to '' (two single quotes with nothing
between them) restores it's default setting.
(For use with MOAB only.)
Specify the default Quality of Service (QOS) values that jobs are able
to run at for this cluster. To get a list of valid QOS names use
'sacctmgr list qos'. This value is overridden if a child has a QOS
value directly set. Setting a QosLevel to '' (two single quotes with
nothing between them) restores its default setting. You can also use
the operators += and \-= to add or remove certain QOS names from a
QOS list.
Valid <operator> values include:
.RS
.TP 5
......@@ -492,13 +529,19 @@ To clear a previously set value use the modify command with a new value of \-1.
.TP
\fIMaxCPUMins\fP=<max cpu minutes>
Maximum number of CPU minutes each job is able to use for this user.
To clear a previously set value use the modify command with a new value of \-1.
To clear a previously set value use the modify command with a new
value of \-1. (NOTE: This limit is not currently enforced in SLURM.
You can still set it, but it will not be enforced until a future
version of SLURM.)
.TP
\fIMaxCPUs\fP=<max cpus>
Maximum number of CPUs each job is able to use for this user.
Default is the account's limit.
To clear a previously set value use the modify command with a new value of \-1.
To clear a previously set value use the modify command with a new
value of \-1. (NOTE: This limit is not currently enforced in SLURM.
You can still set it, but it will not be enforced until a future
version of SLURM.)
.TP
\fIMaxJobs\fP=<max jobs>
......@@ -533,12 +576,13 @@ Name of SLURM partition these limits apply to.
.TP
\fIQosLevel\fP<operator><comma separated list of qos names>
Specify Quality of Service that jobs are to run at for this account.
Now consisting of Normal, Standby, Expedite, and Exempt.
This is overridden if set directly on an account user.
Setting an account's QosLevel to '' (two single quotes with nothing
between them) restores it's default setting.
Valid <operator> values include:
(For use with MOAB only.)
Specify the default Quality of Service (QOS) values that jobs are able
to run at for this user. To get a list of valid QOS names use
'sacctmgr list qos'. This value will override its parent's value.
Setting a QosLevel to '' (two single quotes with nothing between them)
restores its default setting. You can also use the operators += and
\-= to add or remove certain QOS names from a QOS list.
.RS
.TP 5
\fB=\fR
......@@ -591,71 +635,180 @@ is always a default for any cluster and does not need to be defined.
To edit/create a file start with a cluster line for the new cluster
\fBCluster\ \-\ cluster_name\fP
\fBCluster\ \-\ cluster_name:MaxNodesPerJob=15\fP
Anything included on this line will be the defaults for all
associations on this cluster. These options are as follows...
.TP
GrpCPUMins=
Maximum number of CPU minutes running jobs are able to
be allocated in aggregate for this association and all associations
which are children of this association. (NOTE: this limit is not
currently enforced in SLURM. You can still set it, but it will not
be enforced until a future version of SLURM.)
.TP
GrpCPUs=
Maximum number of CPUs running jobs are able to be
allocated in aggregate for this association and all associations which
are children of this association. (NOTE: this limit is not currently
enforced in SLURM. You can still set it, but it will not be enforced
until a future version of SLURM.)
.TP
GrpJobs=
Maximum number of running jobs in aggregate for this
association and all associations which are children of this association.
.TP
GrpNodes=
Maximum number of nodes running jobs are able to be
allocated in aggregate for this association and all associations which
are children of this association.
.TP
GrpSubmitJobs=
Maximum number of jobs which can be in a pending or
running state at any time in aggregate for this association and all
associations which are children of this association.
.TP
GrpWall=
Maximum wall clock time running jobs are able to be
allocated in aggregate for this association and all associations which
are children of this association.
.TP
FairShare=
To be used with a scheduler like MOAB to determine priority.
.TP
MaxJobs=
Maximum number of jobs the children of this account can run.
.TP
MaxNodesPerJob=
Maximum number of nodes per job the children of this account can run.
.TP
MaxProcSecondsPerJob=
Maximum CPU seconds the jobs of this account's children can run.
.TP
MaxWallDurationPerJob=
Maximum time (not related to job size) the jobs of this account's children can run.
.TP
QOS=
Comma separated list of Quality of Service names (Defined in sacctmgr).
.TP
Followed by Accounts you want in this fashion...
\fBAccount\ \-\ cs:MaxNodesPerJob=5:MaxJobs=4:MaxProcSecondsPerJob=20:FairShare=399:MaxWallDurationPerJob=40:Description='Computer Science':Organization='LC'\fP
\fBParent\ \-\ root\fP (Defined by default)
.br
\fBAccount\ \-\ cs\fP:MaxNodesPerJob=5:MaxJobs=4:MaxProcSecondsPerJob=20:FairShare=399:MaxWallDurationPerJob=40:Description='Computer Science':Organization='LC'
.br
\fBParent\ \-\ cs\fP
.br
\fBAccount\ \-\ test\fP:MaxNodesPerJob=1:MaxJobs=1:MaxProcSecondsPerJob=1:FairShare=1:MaxWallDurationPerJob=1:Description='Test Account':Organization='Test'
.TP
Any of the options after a ':' can be left out and they can be in any order.
If you want to add any sub accounts just list the Parent THAT HAS ALREADY
BEEN CREATED before the account line in this fashion...
.TP
All account options are
.br
Description= \- a brief description of the account
.br
FairShare= \- to be used with a scheduler like MOAB to determine priority
.br
MaxJobs= \- maximum number of jobs the children of this account can run
.br
MaxNodesPerJob= \- maximum number of nodes per job the children of this
account can run
.br
MaxProcSecondsPerJob= \- maximum cpu seconds children of this accounts
jobs can run
.br
MaxWallDurationPerJob= \- maximum time (not related to job size)
children of this accounts jobs can run
.br
Organization= \- Name of organization that owns this account
.br
QOS= \- Comma separated list of Quality of Service names (Defined in sacctmgr)
.br
\fBParent\ \-\ cs
.br
Account\ \-\ test:MaxNodesPerJob=1:MaxJobs=1:MaxProcSecondsPerJob=1:FairShare=1:MaxWallDurationPerJob=1:Description='Test Account':Organization='Test'\fP
.TP
Description=
A brief description of the account.
.TP
GrpCPUMins=
Maximum number of CPU minutes running jobs are able to
be allocated in aggregate for this association and all associations
which are children of this association. (NOTE: this limit is not
currently enforced in SLURM. You can still set it, but it will not
be enforced until a future version of SLURM.)
.TP
GrpCPUs=
Maximum number of CPUs running jobs are able to be
allocated in aggregate for this association and all associations which
are children of this association. (NOTE: this limit is not currently
enforced in SLURM. You can still set it, but it will not be enforced
until a future version of SLURM.)
.TP
GrpJobs=
Maximum number of running jobs in aggregate for this
association and all associations which are children of this association.
.TP
GrpNodes=
Maximum number of nodes running jobs are able to be
allocated in aggregate for this association and all associations which
are children of this association.
.TP
GrpSubmitJobs=
Maximum number of jobs which can be in a pending or
running state at any time in aggregate for this association and all
associations which are children of this association.
.TP
GrpWall=
Maximum wall clock time running jobs are able to be
allocated in aggregate for this association and all associations which
are children of this association.
.TP
FairShare=
To be used with a scheduler like MOAB to determine priority.
.TP
MaxJobs=
Maximum number of jobs the children of this account can run.
.TP
MaxNodesPerJob=
Maximum number of nodes per job the children of this account can run.
.TP
MaxProcSecondsPerJob=
Maximum CPU seconds the jobs of this account's children can run.
.TP
MaxWallDurationPerJob=
Maximum time (not related to job size) the jobs of this account's children can run.
.TP
Organization=
Name of organization that owns this account.
.TP
QOS(=,+=,\-=)
Comma separated list of Quality of Service names (Defined in sacctmgr).
.TP
To add users to an account add a line like this after a Parent \- line
\fBParent\ \-\ test\fP
.br
\fBUser\ \-\ adam\fP:MaxNodesPerJob=2:MaxJobs=3:MaxProcSecondsPerJob=4:FairShare=1:MaxWallDurationPerJob=1:AdminLevel=Operator:Coordinator='test'
\fBUser\ \-\ lipari:MaxNodesPerJob=2:MaxJobs=3:MaxProcSecondsPerJob=4:FairShare=1:MaxWallDurationPerJob=1:AdminLevel=Operator:Coordinator='test'\fP
.TP
All user options are
.TP
AdminLevel=
Type of admin this user is (Administrator, Operator)
.br
AdminLevel= \- Type of admin this user is (Administrator, Operator)
\fBMust be defined on the first occurrence of the user.\fP
.TP
Coordinator=
Comma separated list of accounts this user is coordinator over
.br
Coordinator= \- Comma separated list of accounts this user is
coordinator over \fBMust be defined on the first occurrence of the user.\fP
.br
DefaultAccount= \- system wide default account name
\fBMust be defined on the first occurrence of the user.\fP
.TP
DefaultAccount=
system wide default account name
.br
FairShare= \- to be used with a scheduler like MOAB to determine priority
.br
MaxJobs= \- maximum number of jobs this user can run
.br
MaxNodesPerJob= \- maximum number of nodes per job this user can run
.br
MaxProcSecondsPerJob= \- maximum cpu seconds this user can run per job
.br
MaxWallDurationPerJob= \- maximum time (not related to job size) this
user can run
.br
QOS= \- Comma separated list of Quality of Service names (Defined in sacctmgr)
\fBMust be defined on the first occurrence of the user.\fP
.br
.TP
FairShare=
To be used with a scheduler like MOAB to determine priority.
.TP
MaxJobs=
Maximum number of jobs this user can run.
.TP
MaxNodesPerJob=
Maximum number of nodes per job this user can run.
.TP
MaxProcSecondsPerJob=
Maximum cpu seconds this user can run per job.
.TP
MaxWallDurationPerJob=
Maximum time (not related to job size) this user can run.
.TP
QOS(=,+=,\-=)
Comma separated list of Quality of Service names (Defined in sacctmgr).
.RE
.SH "EXAMPLES"
......@@ -671,13 +824,65 @@ QOS= \- Comma separated list of Quality of Service names (Defined in sacctmgr)
.br
> sacctmgr create user name=adam cluster=tux account=physics fairshare=10
.br
> sacctmgr modify user with name=adam cluster=tux account=physics set
> sacctmgr modify user name=adam cluster=tux account=physics set
maxjobs=2 maxtime=30:00
.br
> sacctmgr dump cluster=tux tux_data_file
.br
> sacctmgr load tux_data_file
.br
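As noted in the option descriptions above, a previously set limit is
cleared by modifying it to \-1; for example (the account and limit
values here are hypothetical):
.br
> sacctmgr modify user name=adam cluster=tux account=physics set maxjobs=\-1
.br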
.br
When modifying an object, placing the key word 'set' and the optional
'where' correctly is critical; below are examples that produce correct
results. As a rule of thumb, anything you put in front of 'set' will
be used as a quantifier. If you want to put a quantifier after the
key word 'set', you should use the key word 'where'.
.br
.br
wrong> sacctmgr modify user name=adam set fairshare=10 cluster=tux
.br
.br
This will produce an error as the above line reads modify user adam
set fairshare=10 and cluster=tux.
.br
.br
right> sacctmgr modify user name=adam cluster=tux set fairshare=10
.br
right> sacctmgr modify user name=adam set fairshare=10 where cluster=tux
.br
.br
(For use with MOAB only)
When changing the QOS of an entity, use the '=' operator only when you
want to explicitly set the QOS to something. In most cases you will
want to use the '+=' or '\-=' operators to either add to or remove
from the existing QOS list already in place.
.br
.br
If a user already has a QOS of normal,standby from a parent, or it was
explicitly set, you should use qos+=expedite to add to the list in
this fashion.
.br
.br
> sacctmgr modify user name=adam set qos+=expedite
.br
.br
If you want to add the QOS expedite to only a certain account and/or
cluster, you can do so by specifying them on the sacctmgr line.
.br
.br
> sacctmgr modify user name=adam acct=this cluster=tux set qos+=expedite
.br
.ec
.SH "COPYING"
......
......@@ -100,6 +100,11 @@ The \fIlist\fR of constraints may include multiple features separated
by ampersand (AND) and/or vertical bar (OR) operators.
For example: \fB\-\-constraint="opteron&video"\fR or
\fB\-\-constraint="fast|faster"\fR.
In the first example, only nodes having both the feature "opteron" AND
the feature "video" will be used.
There is no mechanism to specify that you want one node with feature
"opteron" and another node with feature "video" in the case that no
node has both features.
If only one of a set of possible options should be used for all allocated
nodes, then use the OR operator and enclose the options within square brackets.
For example: "\fB\-\-constraint="[rack1|rack2|rack3|rack4]"\fR might
......
......@@ -90,6 +90,11 @@ The \fIlist\fR of constraints may include multiple features separated
by ampersand (AND) and/or vertical bar (OR) operators.
For example: \fB\-\-constraint="opteron&video"\fR or
\fB\-\-constraint="fast|faster"\fR.
In the first example, only nodes having both the feature "opteron" AND
the feature "video" will be used.
There is no mechanism to specify that you want one node with feature
"opteron" and another node with feature "video" in the case that no
node has both features.
If only one of a set of possible options should be used for all allocated
nodes, then use the OR operator and enclose the options within square brackets.
For example: "\fB\-\-constraint="[rack1|rack2|rack3|rack4]"\fR might
......
......@@ -116,7 +116,7 @@ This is an independent command with no options meant for use in interactive mode
.TP
\fBversion\fP
Display the version number of sreport being executed.
-q or --quiet: equivalent to \"quiet\" command \n\
\-q or \-\-quiet: equivalent to "quiet" command
.TP
\fB!!\fP
......@@ -128,9 +128,48 @@ Repeat the last command executed.
.TP
Various reports are as follows...
cluster - Utilization
job - Sizes
user - TopUsage
cluster \- AccountUtilizationByUser, UserUtilizationByAccount, Utilization
job \- Sizes
user \- TopUsage
.TP
REPORT DESCRIPTION
.RS
.TP
.B cluster AccountUtilizationByUser
This report will display account utilization as it appears on the
hierarchical tree. Starting with the specified account, or the root
account by default, this report will list the underlying usage with a
sum on each level. Use the 'tree' option to span the tree for better
visibility.
.TP
.B cluster UserUtilizationByAccount
This report will display users by account in order of utilization
without grouping a user's multiple accounts into one, instead
displaying them on separate lines.
.TP
.B cluster Utilization
This report will display total usage divided into Allocated, Down,
Idle, and Reserved time for the selected clusters. Reserved time
refers to the time a job was waiting for resources after it had
become eligible. If this value is not of importance for you, the
number can be grouped with idle time.
.TP
.B job Sizes
This report will display the amount of time used for the job size
ranges specified by the 'grouping=' option. Only a single level in
the tree is displayed, defaulting to the root of the tree. If you
specify other accounts with the 'account=' option you will receive
those accounts' subaccounts.
.TP
.B user TopUsage
Displays the top users on a cluster. Use the 'group' option to group
accounts together. The default is to have a separate line for each
user/account combination.
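.TP
Illustrative invocations of the reports above (option names follow the
descriptions in this section; the account name and grouping values are
hypothetical):
.br
> sreport cluster AccountUtilizationByUser tree
.br
> sreport cluster Utilization
.br
> sreport job Sizes grouping=50,150,250 account=cs
.br
> sreport user TopUsage group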
.TP
Each report type has various options...
......@@ -158,6 +197,10 @@ CLUSTER
.TP
.B Names=<OPT>
List of clusters to include in report. Default is local cluster.
.TP
.B Tree
When used with the AccountUtilizationByUser report, spans the
accounts as they appear in the hierarchy.
.RE
.TP
......@@ -182,6 +225,10 @@ List of jobs/steps to include in report. Default is all.
.B Partitions=<OPT>
List of partitions jobs ran on to include in report. Default is all.
.TP
.B PrintJobCount
When used with the Sizes report, prints the number of jobs run
instead of the time used.
.TP
.B Users=<OPT>
List of users' jobs to include in report. Default is all.
.RE
......
......@@ -98,6 +98,11 @@ The \fIlist\fR of constraints may include multiple features separated
by ampersand (AND) and/or vertical bar (OR) operators.
For example: \fB\-\-constraint="opteron&video"\fR or
\fB\-\-constraint="fast|faster"\fR.
In the first example, only nodes having both the feature "opteron" AND
the feature "video" will be used.
There is no mechanism to specify that you want one node with feature
"opteron" and another node with feature "video" in the case that no
node has both features.
If only one of a set of possible options should be used for all allocated
nodes, then use the OR operator and enclose the options within square brackets.
For example: "\fB\-\-constraint="[rack1|rack2|rack3|rack4]"\fR might
......
.TH "Slurm API" "3" "April 2006" "Morris Jette" "Slurm job initiation functions"
.SH "NAME"
slurm_allocate_resources,
slurm_allocation_lookup, slurm_confirm_allocation,
slurm_allocate_resources, slurm_allocate_resources_blocking,
slurm_allocation_msg_thr_create, slurm_allocation_msg_thr_destroy,
slurm_allocation_lookup, slurm_allocation_lookup_lite,
slurm_confirm_allocation,
slurm_free_submit_response_response_msg, slurm_init_job_desc_msg,
slurm_job_will_run, slurm_submit_batch_job
slurm_job_will_run, slurm_read_hostfile, slurm_submit_batch_job
\- Slurm job initiation functions
.SH "SYNTAX"
.LP
......@@ -17,6 +19,28 @@ int \fBslurm_allocate_resources\fR (
.br
);
.LP
resource_allocation_response_msg_t *\fBslurm_allocate_resources_blocking\fR (
.br
job_desc_msg_t *\fIjob_desc_msg_ptr\fP,
.br
time_t \fItimeout\fP, void \fI(*pending_callback)(uint32_t job_id)\fP
.br
);
.LP
allocation_msg_thread_t *\fBslurm_allocation_msg_thr_create\fR (
.br
uint16_t *\fIport\fP,
.br
slurm_allocation_callbacks_t *\fIcallbacks\fP
.br
);
.LP
void *\fBslurm_allocation_msg_thr_destroy\fR (
.br
allocation_msg_thread_t *\fIslurm_alloc_msg_thr_ptr\fP
.br
);
.LP
int \fBslurm_allocation_lookup\fR (
.br
uint32_t \fIjobid\fP,
......@@ -25,6 +49,14 @@ int \fBslurm_allocation_lookup\fR {
.br
);
.LP
int \fBslurm_allocation_lookup_lite\fR (
.br
uint32_t \fIjobid\fP,
.br
resource_allocation_response_msg_t **\fIslurm_alloc_msg_pptr\fP
.br
);
.LP
int \fBslurm_confirm_allocation\fR (
.br
old_job_alloc_msg_t *\fIold_job_desc_msg_ptr\fP,
......@@ -57,6 +89,12 @@ int \fBslurm_job_will_run\fR (
.br
);
.LP
char *\fBslurm_read_hostfile\fR (
.br
char *\fIfilename\fP, int \fIn\fP
.br
);
.LP
int \fBslurm_submit_batch_job\fR (
.br
job_desc_msg_t *\fIjob_desc_msg_ptr\fP,
......@@ -70,6 +108,10 @@ int \fBslurm_submit_batch_job\fR (
\fIjob_desc_msg_ptr\fP
Specifies the pointer to a job request specification. See slurm.h for full details
on the data structure's contents.
.TP
\fIcallbacks\fP
Specifies the pointer to an allocation callbacks structure. See
slurm.h for full details on the data structure's contents.
.TP
\fIold_job_desc_msg_ptr\fP
Specifies the pointer to a description of an existing job. See slurm.h for
......@@ -83,13 +125,21 @@ structure's contents.
.TP
\fIslurm_alloc_msg_ptr\fP
Specifies the pointer to the structure to be created and filled in by the function
\fIslurm_allocate_resources\fP, \fIslurm_allocation_lookup\fP,
\fIslurm_allocate_resources\fP,
\fIslurm_allocate_resources_blocking\fP,
\fIslurm_allocation_lookup\fP, \fIslurm_allocation_lookup_lite\fP,
\fIslurm_confirm_allocation\fP or \fIslurm_job_will_run\fP.
.TP
\fIslurm_alloc_msg_thr_ptr\fP
Specifies the pointer to the structure created and returned by the
function \fIslurm_allocation_msg_thr_create\fP. Must be destroyed
with the function \fIslurm_allocation_msg_thr_destroy\fP.
.TP
\fIslurm_submit_msg_pptr\fP
Specifies the double pointer to the structure to be created and filled with a description
of the created job: job ID, etc. See slurm.h for full details on the data structure's contents.
.TP
of the created job: job ID, etc. See slurm.h for full details on the
data structure's contents.
.TP
\fIslurm_submit_msg_ptr\fP
Specifies the pointer to the structure to be created and filled in by the function \fIslurm_submit_batch_job\fP.
.SH "DESCRIPTION"
......@@ -100,16 +150,43 @@ count or time allocation are outside of the partition's limits then a job
entry will be created, a warning indication will be placed in the \fIerror_code\fP field of the response message, and the job will be left
queued until the partition's limits are changed.
Always release the response message when no longer required using
the function \fBslurm_free_resource_allocation_response_msg\fR.
.LP
the function \fBslurm_free_resource_allocation_response_msg\fR. This
function only makes the request once. If the allocation is not
available immediately, the node_cnt variable in the response will be
0. If you want a function that will block until either an error is
received or an allocation is granted, you can use the
\fIslurm_allocate_resources_blocking\fP function described below.
.LP
\fBslurm_allocate_resources_blocking\fR Request a resource allocation
for a job. This call will block until the allocation is granted, an
error occurs, or the specified timeout limit is reached. The
\fIpending_callback\fP parameter will be called if the allocation is
not available immediately and the immediate flag is not set in the
request. This can be used to get the job id of the job while waiting
for the allocation to become available. On failure NULL is returned
and errno is set.
.LP
\fBslurm_allocation_msg_thr_create\fR Start up a message handler that
talks with the controller and deals with messages from the controller
during an allocation. Callback functions are declared in the
\fIcallbacks\fP parameter and will be called when a corresponding
message is received from the controller. This message thread is
needed to receive messages from the controller about node failure in
an allocation and other important messages. Although technically not
required, it can be very helpful for learning about problems with the
allocation.
.LP
\fBslurm_allocation_msg_thr_destroy\fR Shut down the message handler
that talks with the controller and deals with messages from the
controller during an allocation.
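.LP
The following is a minimal create/destroy sketch, not taken from the
SLURM sources. The create and destroy signatures appear in the SYNTAX
section above, but the \fIslurm_allocation_callbacks_t\fP member used
here (node_fail) is an assumption; consult slurm.h for the actual
structure contents.
.br
	static void my_node_fail (srun_node_fail_msg_t *msg)
.br
	{
.br
		/* react to a failed node in the allocation */
.br
	}
.br
	...
.br
	uint16_t port = 0;
.br
	slurm_allocation_callbacks_t callbacks;
.br
	memset (&callbacks, 0, sizeof(callbacks));
.br
	callbacks.node_fail = my_node_fail;  /* assumed member name */
.br
	allocation_msg_thread_t *msg_thr =
.br
		slurm_allocation_msg_thr_create (&port, &callbacks);
.br
	/* ... use the allocation; the handler runs until destroyed ... */
.br
	slurm_allocation_msg_thr_destroy (msg_thr);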
.LP
\fBslurm_confirm_allocation\fR Return detailed information on a specific
existing job allocation. \fBOBSOLETE FUNCTION: Use slurm_allocation_lookup
instead.\fR This function may only be successfully executed by the job's
owner or user root.
.LP
\fBslurm_free_resource_allocation_response_msg\fR Release the storage generated in response
to a call of the function \fBslurm_allocate_resources\fR, or
\fBslurm_allocation_lookup\fR.
to a call of the function \fBslurm_allocate_resources\fR,
\fBslurm_allocation_lookup\fR, or \fBslurm_allocation_lookup_lite\fR.
.LP
\fBslurm_free_submit_response_msg\fR Release the storage generated in response
to a call of the function \fBslurm_submit_batch_job\fR.
......@@ -119,6 +196,13 @@ Execute this function before issuing a request to submit or modify a job.
.LP
\fBslurm_job_will_run\fR Determine if the supplied job description could be executed immediately.
.LP
\fBslurm_read_hostfile\fR Read a SLURM hostfile specified by
"filename". "filename" must contain a list of SLURM NodeNames, one
per line. Reads up to "n" hostnames from the file. Returns a ranged
hostlist string representing the contents of the file. This is a
helper function; it does not contact any SLURM daemons.
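.LP
A short usage sketch (the file name and hostname count below are
hypothetical):
.br
	/* "hosts.txt" lists one SLURM NodeName per line; read up to 4. */
.br
	char *ranged = slurm_read_hostfile ("hosts.txt", 4);
.br
	if (ranged)
.br
		printf ("hostlist: %s\\n", ranged);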
.LP
\fBslurm_submit_batch_job\fR Submit a job for later execution. Note that if
the job's requested node count or time allocation are outside of the partition's limits then a job entry will be created, a warning indication will be placed in the \fIerror_code\fP field of the response message, and the job will be left queued until the partition's limits are changed and resources are available. Always release the response message when no
longer required using the function \fBslurm_free_submit_response_msg\fR.
......@@ -186,10 +270,14 @@ the partition's time limit.
.LP
\fBSLURM_PROTOCOL_SOCKET_IMPL_TIMEOUT\fR Timeout in communicating with
SLURM controller.
.SH "EXAMPLE"
.SH "NON-BLOCKING EXAMPLE"
.LP
#include <stdio.h>
.br
#include <stdlib.h>
.br
#include <signal.h>
.br
#include <slurm/slurm.h>
.br
#include <slurm/slurm_errno.h>
......@@ -202,16 +290,20 @@ int main (int argc, char *argv[])
.br
resource_allocation_response_msg_t* slurm_alloc_msg_ptr ;
.LP
slurm_init_job_desc_msg( &job_mesg );
slurm_init_job_desc_msg( &job_desc_msg );
.br
job_mesg. name = ("job01\0");
job_desc_msg. name = ("job01\0");
.br
job_mesg. min_memory = 1024;
job_desc_msg. job_min_memory = 1024;
.br
job_mesg. time_limit = 200;
job_desc_msg. time_limit = 200;
.br
job_mesg. num_nodes = 400;
job_desc_msg. min_nodes = 400;
.br
job_desc_msg. user_id = getuid();
.br
job_desc_msg. group_id = getgid();
.br
if (slurm_allocate_resources(&job_desc_msg,
.br
&slurm_alloc_msg_ptr)) {
......@@ -228,9 +320,77 @@ int main (int argc, char *argv[])
.br
slurm_alloc_msg_ptr\->job_id );
.br
if (slurm_job_kill(slurm_alloc_msg_ptr\->
if (slurm_kill_job(slurm_alloc_msg_ptr\->job_id, SIGKILL, 0)) {
.br
printf ("kill errno %d\\n", slurm_get_errno());
.br
exit (1);
.br
}
.br
printf ("canceled job_id %u\\n",
.br
slurm_alloc_msg_ptr\->job_id );
.br
slurm_free_resource_allocation_response_msg(
.br
slurm_alloc_msg_ptr);
.br
exit (0);
.br
}
.SH "BLOCKING EXAMPLE"
.LP
#include <stdio.h>
.br
#include <stdlib.h>
.br
#include <signal.h>
.br
#include <slurm/slurm.h>
.br
#include <slurm/slurm_errno.h>
.LP
int main (int argc, char *argv[])
.br
{
.br
job_desc_msg_t job_desc_msg;
.br
resource_allocation_response_msg_t* slurm_alloc_msg_ptr ;
.LP
slurm_init_job_desc_msg( &job_desc_msg );
.br
job_desc_msg. name = ("job01\0");
.br
job_desc_msg. job_min_memory = 1024;
.br
job_desc_msg. time_limit = 200;
.br
job_desc_msg. min_nodes = 400;
.br
job_desc_msg. user_id = getuid();
.br
job_desc_msg. group_id = getgid();
.br
if (!(slurm_alloc_msg_ptr =
.br
slurm_allocate_resources_blocking(&job_desc_msg, 0, NULL))) {
.br
slurm_perror ("slurm_allocate_resources_blocking error");
.br
exit (1);
.br
}
.br
printf ("Allocated nodes %s to job_id %u\\n",
.br
slurm_alloc_msg_ptr\->node_list,
.br
slurm_alloc_msg_ptr\->job_id );
.br
if (slurm_kill_job(slurm_alloc_msg_ptr\->job_id, SIGKILL, 0)) {
.br
printf ("kill errno %d\\n", slurm_get_errno());
.br
......
.so man3/slurm_allocate_resources.3
.so man3/slurm_allocate_resources.3
.so man3/slurm_allocate_resources.3
.so man3/slurm_allocate_resources.3
.so man3/slurm_checkpoint_error.3
.so man3/slurm_trigger.3
.so man3/slurm_free_job_info_msg.3