diff --git a/NEWS b/NEWS
index 1d09ce2ce8e84c080fae75efad3ddbcab27ac81a..210d974a079d44f4348c78f77a24a56f718dcfa3 100644
--- a/NEWS
+++ b/NEWS
@@ -8,8 +8,12 @@ documents those changes that are of interest to users and admins.
     damon that should be running on the node is not running. Patch from
     Rod Schulz, Bull.
  -- Improve accuracy of response to "srun --test-only jobid=#".
- -- Correct logic to perperly support --ntasks-per-node option in the
+ -- Correct logic to properly support --ntasks-per-node option in the
     select/cons_res plugin. Patch from Rod Schulz, Bull.
+ -- Fix bug in select/cons_res with respect to generic resource (gres)
+    scheduling which prevented some jobs from starting as soon as possible.
+ -- Fix memory leak in select/cons_res when backfill scheduling generic
+    resources (gres).
 
 * Changes in SLURM 2.3.0.pre4
 =============================
diff --git a/doc/man/man1/salloc.1 b/doc/man/man1/salloc.1
index f23920a7038809315d32bf41aef32e9f813e2ae8..b6ffad71d42860dce578444c3eb294d0b064c09e 100644
--- a/doc/man/man1/salloc.1
+++ b/doc/man/man1/salloc.1
@@ -163,6 +163,12 @@ If set, then the allocated nodes must form a contiguous set.
 Not honored with the \fBtopology/tree\fR or \fBtopology/3d_torus\fR
 plugins, both of which can modify the node ordering.
 
+.TP
+\fB\-\-cores\-per\-socket\fR=<\fIcores\fR>
+Restrict node selection to nodes with at least the specified number of
+cores per socket. See additional information under \fB\-B\fR option
+above when task/affinity plugin is enabled.
+
 .TP
 \fB\-\-cpu_bind\fR=[{\fIquiet,verbose\fR},]\fItype\fR
 Bind tasks to CPUs. Used only when the task/affinity plugin is enabled.
@@ -501,7 +507,7 @@ round\-robin fashion). For example, consider an allocation of three
 nodes each with two cpus. A four\-task cyclic distribution request will
 distribute those tasks to the nodes with tasks one and four on the first
 node, task two on the second node, and task three on the
-third node. 
+third node.
 Note that when SelectType is select/cons_res, the same number of CPUs
 may not be allocated on each node. Task distribution will be
 round\-robin among all the nodes with CPUs yet to be assigned to tasks.
@@ -603,8 +609,8 @@ NOTE: To have SLURM always report on the selected memory binding
 for all commands executed in a shell, you can enable verbose mode by
 setting the SLURM_MEM_BIND environment variable value to "verbose".
 
-The following informational environment variables are set when \fB\-\-mem_bind\
-is in use:
+The following informational environment variables are set when
+\fB\-\-mem_bind\fR is in use:
 
 .nf
 SLURM_MEM_BIND_VERBOSE
@@ -823,6 +829,12 @@ By default, no signal is sent before the job's end time.
 If a \fIsig_num\fR
 is specified without any \fIsig_time\fR,
 the default time will be 60 seconds.
+.TP
+\fB\-\-sockets\-per\-node\fR=<\fIsockets\fR>
+Restrict node selection to nodes with at least the specified number of
+sockets. See additional information under \fB\-B\fR option above when
+task/affinity plugin is enabled.
+
 .TP
 \fB\-t\fR, \fB\-\-time\fR=<\fItime\fR>
 Set a limit on the total run time of the job allocation. If the
@@ -836,6 +848,12 @@ limit be imposed. Acceptable time formats include "minutes",
 "minutes:seconds", "hours:minutes:seconds", "days\-hours",
 "days\-hours:minutes" and "days\-hours:minutes:seconds".
 
+.TP
+\fB\-\-threads\-per\-core\fR=<\fIthreads\fR>
+Restrict node selection to nodes with at least the specified number of
+threads per core. See additional information under \fB\-B\fR option
+above when task/affinity plugin is enabled.
+
 .TP
 \fB\-\-time\-min\fR=<\fItime\fR>
 Set a minimum time limit on the job allocation.
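
The cyclic-distribution note added to salloc.1 above can be made concrete.
The following stand-alone C sketch is illustrative only (it is not SLURM
code; the node and task counts are hypothetical): it hands out tasks
round-robin while skipping nodes whose CPUs are already fully assigned,
which is the behavior the new select/cons_res note describes. With three
nodes of two CPUs each and four tasks it prints exactly the layout from
the man-page example: tasks 1 and 4 on node 1, task 2 on node 2, task 3
on node 3.

/* Illustrative sketch, not SLURM source: cyclic task distribution
 * among nodes that still have CPUs available, as described in the
 * salloc.1 text above.  Node and task counts are hypothetical. */
#include <stdio.h>

#define NODES 3
#define TASKS 4

int main(void)
{
	/* CPUs allocated on each node; with select/cons_res these
	 * counts may differ from node to node. */
	int cpus_left[NODES] = { 2, 2, 2 };
	int node = 0;

	for (int task = 1; task <= TASKS; task++) {
		/* Skip nodes whose CPUs are all assigned already. */
		while (cpus_left[node % NODES] == 0)
			node++;
		printf("task %d -> node %d\n", task, (node % NODES) + 1);
		cpus_left[node % NODES]--;
		node++;	/* round-robin to the next node */
	}
	return 0;
}

Making one node's cpus_left smaller (an uneven cons_res allocation)
shifts the later tasks onto the remaining nodes, which is why the man
page now warns that the per-node task counts may differ.
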
diff --git a/doc/man/man1/sbatch.1 b/doc/man/man1/sbatch.1
index 6224d5bd0f19edbe0edd31015ea928df153f9894..264d83c41e2d05c871e5d6ff422417e3e57341d0 100644
--- a/doc/man/man1/sbatch.1
+++ b/doc/man/man1/sbatch.1
@@ -85,7 +85,7 @@ You may also specify \fImidnight\fR, \fInoon\fR, or
 with \fIAM\fR or \fIPM\fR for running in the morning or the evening.
 You can also say what day the job will be run, by specifying
 a date of the form \fIMMDDYY\fR or \fIMM/DD/YY\fR
-\fIYYYY\-MM-DD\fR. Combine date and time using the following
+\fIYYYY\-MM\-DD\fR. Combine date and time using the following
 format \fIYYYY\-MM\-DD[THH:MM[:SS]]\fR. You can also give
 times like \fInow + count time\-units\fR, where the time\-units
 can be \fIseconds\fR (default), \fIminutes\fR, \fIhours\fR,
@@ -101,6 +101,7 @@ For example:
 \-\-begin=now+60 (seconds by default)
 \-\-begin=2010\-01\-20T12:34:00
 .fi
+
 .RS
 .PP
 Notes on date/time specifications:
@@ -171,6 +172,12 @@ If set, then the allocated nodes must form a contiguous set.
 Not honored with the \fBtopology/tree\fR or \fBtopology/3d_torus\fR
 plugins, both of which can modify the node ordering.
 
+.TP
+\fB\-\-cores\-per\-socket\fR=<\fIcores\fR>
+Restrict node selection to nodes with at least the specified number of
+cores per socket. See additional information under \fB\-B\fR option
+above when task/affinity plugin is enabled.
+
 .TP
 \fB\-\-cpu_bind\fR=[{\fIquiet,verbose\fR},]\fItype\fR
 Bind tasks to CPUs. Used only when the task/affinity plugin is enabled.
@@ -555,7 +562,7 @@ round\-robin fashion). For example, consider an allocation of three
 nodes each with two cpus. A four\-task cyclic distribution request will
 distribute those tasks to the nodes with tasks one and four on the first
 node, task two on the second node, and task three on the
-third node. 
+third node.
 Note that when SelectType is select/cons_res, the same number of CPUs
 may not be allocated on each node. Task distribution will be
 round\-robin among all the nodes with CPUs yet to be assigned to tasks.
@@ -658,7 +665,7 @@ all commands executed in a shell, you can enable verbose mode by
 setting the SLURM_MEM_BIND environment variable value to "verbose".
 
 The following informational environment variables are set when
-\fB\-\-mem_bind is in use:
+\fB\-\-mem_bind\fR is in use:
 
 .nf
 SLURM_MEM_BIND_VERBOSE
@@ -810,7 +817,7 @@ This is related to \fB\-\-cpus\-per\-task\fR=\fIncpus\fR,
 but does not require knowledge of the actual number of cpus on
 each node. In some cases, it is more convenient to be able to
 request that no more than a specific number of tasks be invoked
-on each node. Examples include submitting
+on each node. Examples of this include submitting
 a hybrid MPI/OpenMP app where only one MPI "task/rank" should be
 assigned to each node while allowing the OpenMP portion to utilize
 all of the parallelism present in the node, or submitting a single
@@ -935,6 +942,12 @@ By default, no signal is sent before the job's end time.
 If a \fIsig_num\fR
 is specified without any \fIsig_time\fR,
 the default time will be 60 seconds.
+.TP
+\fB\-\-sockets\-per\-node\fR=<\fIsockets\fR>
+Restrict node selection to nodes with at least the specified number of
+sockets. See additional information under \fB\-B\fR option above when
+task/affinity plugin is enabled.
+
 .TP
 \fB\-t\fR, \fB\-\-time\fR=<\fItime\fR>
 Set a limit on the total run time of the job allocation. If the
@@ -953,6 +966,12 @@ limit be imposed. Acceptable time formats include "minutes",
 Specify the number of tasks to be launched per node.
 Equivalent to \fB\-\-ntasks\-per\-node\fR.
 
+.TP
+\fB\-\-threads\-per\-core\fR=<\fIthreads\fR>
+Restrict node selection to nodes with at least the specified number of
+threads per core. See additional information under \fB\-B\fR option
+above when task/affinity plugin is enabled.
+
 .TP
 \fB\-\-time\-min\fR=<\fItime\fR>
 Set a minimum time limit on the job allocation.
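
All three options added to the man pages above (\-\-cores\-per\-socket,
\-\-sockets\-per\-node, \-\-threads\-per\-core) share the same "at least"
semantics: they filter which nodes are eligible for selection rather than
changing what gets allocated on them. Below is a minimal sketch of that
predicate; the struct and field names are hypothetical (not SLURM's), and
a value of zero stands in for "option not given".

/* Hypothetical descriptions of a node's hardware and of the
 * minimums requested on the command line (0 = not specified). */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

struct node_hw { uint16_t sockets, cores_per_socket, threads_per_core; };
struct hw_min  { uint16_t sockets, cores_per_socket, threads_per_core; };

/* A node stays selectable only if it meets every minimum given. */
static bool node_eligible(struct node_hw n, struct hw_min m)
{
	if (m.sockets && n.sockets < m.sockets)
		return false;
	if (m.cores_per_socket && n.cores_per_socket < m.cores_per_socket)
		return false;
	if (m.threads_per_core && n.threads_per_core < m.threads_per_core)
		return false;
	return true;
}

int main(void)
{
	struct node_hw node = { 2, 4, 1 };
	/* e.g. --cores-per-socket=4 --threads-per-core=2 */
	struct hw_min req = { 0, 4, 2 };

	/* Prints 0: the node has only 1 thread per core. */
	printf("eligible: %d\n", node_eligible(node, req));
	return 0;
}
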
diff --git a/doc/man/man1/srun.1 b/doc/man/man1/srun.1
index a7171d7a018c499c319c6be5c2468cab249a37ef..a1748ef2fb0a5bb0f6365f437825eeff9da8247c 100644
--- a/doc/man/man1/srun.1
+++ b/doc/man/man1/srun.1
@@ -158,9 +158,9 @@ Not honored for a job step's allocation.
 
 .TP
 \fB\-\-cores\-per\-socket\fR=<\fIcores\fR>
-Allocate the specified number of cores per socket. This may be used to avoid
-allocating more than one core per socket on multi\-core sockets. This option
-is used for job allocations, but ignored for job step allocations.
+Restrict node selection to nodes with at least the specified number of
+cores per socket. See additional information under \fB\-B\fR option
+above when task/affinity plugin is enabled.
 
 .TP
 \fB\-\-cpu_bind\fR=[{\fIquiet,verbose\fR},]\fItype\fR
@@ -583,7 +583,7 @@ SLURM_HOSTFILE. If this variable is listed it will over ride any other
 method specified. If not set the method will default to block. Inside
 the hostfile must contain at minimum the number of hosts requested and
 be one per line or comma separated. If specifying a
-task count (\fB\-n\fR, \fB\-\-ntasks\fR=<\fInumber\fR>), your tasks 
+task count (\fB\-n\fR, \fB\-\-ntasks\fR=<\fInumber\fR>), your tasks
 will be laid out on the nodes in the order of the file.
 
 .TP
@@ -658,8 +658,8 @@ NOTE: To have SLURM always report on the selected memory binding
 for all commands executed in a shell, you can enable verbose mode by
 setting the SLURM_MEM_BIND environment variable value to "verbose".
 
-The following informational environment variables are set when \fB\-\-mem_bind\
-is in use:
+The following informational environment variables are set when
+\fB\-\-mem_bind\fR is in use:
 
 .nf
 SLURM_MEM_BIND_VERBOSE
@@ -1025,9 +1025,9 @@ the job. By default only errors are displayed.
 
 .TP
 \fB\-\-sockets\-per\-node\fR=<\fIsockets\fR>
-Allocate the specified number of sockets per node. This may be used to avoid
-allocating more than one task per node on multi\-socket nodes. This option
-is used for job allocations, but ignored for job step allocations.
+Restrict node selection to nodes with at least the specified number of
+sockets. See additional information under \fB\-B\fR option above when
+task/affinity plugin is enabled.
 
 .TP
 \fB\-T\fR, \fB\-\-threads\fR=<\fInthreads\fR>
@@ -1084,6 +1084,12 @@ current job queue and all the other \fBsrun\fR arguments specifying
 the job. This limits \fBsrun's\fR behavior to just return information;
 no job is actually submitted.
 
+.TP
+\fB\-\-threads\-per\-core\fR=<\fIthreads\fR>
+Restrict node selection to nodes with at least the specified number of
+threads per core. See additional information under \fB\-B\fR option
+above when task/affinity plugin is enabled.
+
 .TP
 \fB\-\-time\-min\fR=<\fItime\fR>
 Set a minimum time limit on the job allocation.
@@ -1097,12 +1103,6 @@ Acceptable time formats include "minutes",
 "minutes:seconds", "hours:minutes:seconds", "days\-hours",
 "days\-hours:minutes" and "days\-hours:minutes:seconds".
 
-.TP
-\fB\-\-threads\-per\-core\fR=<\fIthreads\fR>
-Allocate the specified number of threads per core. This may be used to avoid
-allocating more than one task per core on hyper\-threaded nodes. This option
-is used for job allocations, but ignored for job step allocations.
-
 .TP
 \fB\-\-tmp\fR=<\fIMB\fR>
 Specify a minimum amount of temporary disk space.
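
The job_test.c change that follows fixes the same one-line bug in two
places: each function first picks gres_list conditionally (either the
node's own gres_list or an adjusted copy), but the subsequent
gres_plugin_job_test() call passed node_ptr->gres_list unconditionally,
throwing that selection away. Below is a distilled, stand-alone sketch of
the pattern; all names are hypothetical, not SLURM's.

/* Distilled sketch of the bug pattern fixed below (names hypothetical).
 * A value is computed conditionally, then the call site ignores it. */
#include <stdio.h>

struct node { int node_gres; };

static int gres_test(int gres) { return gres; }

int main(void)
{
	struct node n = { .node_gres = 4 };
	int adjusted_gres = 2;	/* e.g. adjusted for preemptable jobs */
	int use_adjusted = 1;
	int gres;

	if (use_adjusted)
		gres = adjusted_gres;
	else
		gres = n.node_gres;

	/* Buggy: passes n.node_gres, ignoring the selection above. */
	printf("buggy: %d\n", gres_test(n.node_gres));
	/* Fixed: passes the conditionally chosen value. */
	printf("fixed: %d\n", gres_test(gres));
	return 0;
}
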
diff --git a/src/plugins/select/cons_res/job_test.c b/src/plugins/select/cons_res/job_test.c
index 9618f235f59a96c6a443486bb98309da16c18f7a..f11f248daa234b9cc258f27ee77221a2874cb038 100644
--- a/src/plugins/select/cons_res/job_test.c
+++ b/src/plugins/select/cons_res/job_test.c
@@ -643,7 +643,7 @@ uint16_t _can_job_run_on_node(struct job_record *job_ptr, bitstr_t *core_map,
 	else
 		gres_list = node_ptr->gres_list;
 	gres_cpus = gres_plugin_job_test(job_ptr->gres_list,
-					 node_ptr->gres_list, test_only,
+					 gres_list, test_only,
 					 core_map, core_start_bit,
 					 core_end_bit, job_ptr->job_id,
 					 node_ptr->name);
@@ -761,7 +761,7 @@ static int _verify_node_state(struct part_res_record *cr_part_ptr,
 	else
 		gres_list = node_ptr->gres_list;
 	gres_cpus = gres_plugin_job_test(job_ptr->gres_list,
-					 node_ptr->gres_list, true,
+					 gres_list, true,
 					 NULL, 0, 0, job_ptr->job_id,
 					 node_ptr->name);
 	if (gres_cpus == 0) {
diff --git a/src/plugins/select/cons_res/select_cons_res.c b/src/plugins/select/cons_res/select_cons_res.c
index d6dab3741eb2c05e8e252ce233f9b9bbaeba69ac..5da0d4494b9df33ed0daca6531205e7569db4f2f 100644
--- a/src/plugins/select/cons_res/select_cons_res.c
+++ b/src/plugins/select/cons_res/select_cons_res.c
@@ -493,10 +493,15 @@ static int _cr_job_list_sort(void *x, void *y)
 static void _destroy_node_data(struct node_use_record *node_usage,
 			       struct node_res_record *node_data)
 {
+	int i;
+
 	xfree(node_data);
 	if (node_usage) {
-		if (node_usage->gres_list)
-			list_destroy(node_usage->gres_list);
+		for (i = 0; i < select_node_cnt; i++) {
+			if (node_usage[i].gres_list) {
+				list_destroy(node_usage[i].gres_list);
+			}
+		}
 		xfree(node_usage);
 	}
 }
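
The select_cons_res.c hunk above fixes the leak noted in the NEWS entry:
node_usage points at an array of select_node_cnt records, but only the
first element's gres_list was being destroyed. A minimal stand-alone
sketch of the corrected per-element cleanup follows; the types are
hypothetical, and SLURM's List/list_destroy() are modeled as a plain
malloc'd buffer and free() to keep the sketch self-contained (SLURM's
xfree() additionally zeroes the pointer).

/* Stand-alone sketch of the per-element cleanup pattern used in the
 * fixed _destroy_node_data().  Types are hypothetical. */
#include <stdlib.h>

struct node_use_record {
	void *gres_list;	/* stands in for SLURM's List */
};

static int select_node_cnt = 8;

static void destroy_node_usage(struct node_use_record *node_usage)
{
	int i;

	if (!node_usage)
		return;
	/* Free every element's list, not just node_usage[0]'s. */
	for (i = 0; i < select_node_cnt; i++) {
		if (node_usage[i].gres_list)
			free(node_usage[i].gres_list);
	}
	free(node_usage);
}

int main(void)
{
	struct node_use_record *usage =
		calloc(select_node_cnt, sizeof(*usage));

	if (!usage)
		return 1;
	usage[3].gres_list = malloc(16);	/* one node has gres data */
	destroy_node_usage(usage);		/* frees all elements: no leak */
	return 0;
}
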