diff --git a/NEWS b/NEWS index 19c4c0dd40152ad53d018676a92d4a7267c038ab..07b93a37390a6086e6ce59a11293d94b22ccdc91 100644 --- a/NEWS +++ b/NEWS @@ -90,8 +90,12 @@ documents those changes that are of interest to users and admins. failed. -- BLUEGENE - Fix, for if you are setting an subbp into an error mode where the subbp stated isn't the first ionode in a nodecard. - -- fix for backfill to not core when checking shared nodes. - -- fix for scontrol to not core when hitting just return in interactive mode. + -- Fix for backfill to not core when checking shared nodes. + -- Fix for scontrol to not core when hitting just return in interactive mode. + -- Improve sched/backfill logic with respect to shared nodes (multiple jobs + per node). + -- In sched/wiki (Maui interface) add job info fields QOS, RCLASS, DMEM and + TASKPERNODE. * Changes in SLURM 2.0.1 ======================== diff --git a/doc/html/team.shtml b/doc/html/team.shtml index d5c73eb5bf82943022635cfee5d3adf5ac3becfc..1ad16b99b3d6076553f8a9fa5ef78f7bbb9cb7df 100644 --- a/doc/html/team.shtml +++ b/doc/html/team.shtml @@ -50,6 +50,7 @@ Linux NetworX and many other contributorss. 
<li>Donald Lipari (LLNL)</li> <li>Steven McDougall (SiCortex)</li> <li>Donna Mecozzi (LLNL)</li> +<li>Bjørn-Helge Mevik (University of Oslo, Norway)</li> <li>Chris Morrone (LLNL)</li> <li>Pere Munt (Barcelona Supercomputer Center, Spain)</li> <li>Michal Novotny (Masaryk University, Czech Republic)</li> @@ -76,6 +77,6 @@ Networking, Italy)</li> <li>Anne-Marie Wunderlin (Bull)</li> </ul> -<p style="text-align:center;">Last modified 27 May 2009</p> +<p style="text-align:center;">Last modified 16 June 2009</p> <!--#include virtual="footer.txt"--> diff --git a/doc/man/man1/salloc.1 b/doc/man/man1/salloc.1 index 2535bde4194ca5cd692c971595f9b57a7cfecaea..04388acc37804372cca7dace566005fa1f4dab8b 100644 --- a/doc/man/man1/salloc.1 +++ b/doc/man/man1/salloc.1 @@ -87,7 +87,7 @@ For example: \-\-begin=16:00 \-\-begin=now+1hour \-\-begin=now+60 (seconds by default) - \-\-begin=2010-01-20T12:34:67 + \-\-begin=2010-01-20T12:34:00 .fi .TP diff --git a/doc/man/man1/sbatch.1 b/doc/man/man1/sbatch.1 index 08f591002f29e4d4cae9e5b93de4fe2dbdcbe118..cd6850da998f3a02bdbe95eff033e1cda8561d27 100644 --- a/doc/man/man1/sbatch.1 +++ b/doc/man/man1/sbatch.1 @@ -86,7 +86,7 @@ For example: \-\-begin=16:00 \-\-begin=now+1hour \-\-begin=now+60 (seconds by default) - \-\-begin=2010-01-20T12:34:67 + \-\-begin=2010-01-20T12:34:00 .fi .TP diff --git a/doc/man/man1/srun.1 b/doc/man/man1/srun.1 index 0c6987cb31438658da1fb8895241ceda0131908e..1e0ffca30b5b9b9c38f1298f2cc2f28c86debe6e 100644 --- a/doc/man/man1/srun.1 +++ b/doc/man/man1/srun.1 @@ -74,7 +74,7 @@ For example: \-\-begin=16:00 \-\-begin=now+1hour \-\-begin=now+60 (seconds by default) - \-\-begin=2010-01-20T12:34:67 + \-\-begin=2010-01-20T12:34:00 .fi .TP diff --git a/src/plugins/sched/backfill/backfill.c b/src/plugins/sched/backfill/backfill.c index f05a4586ff385ed0f4c6ee889d912fbc15142026..f97d954b02ba8718ba5ddac8b5cb681557f3a1df 100644 --- a/src/plugins/sched/backfill/backfill.c +++ b/src/plugins/sched/backfill/backfill.c @@ -248,6 +248,7 @@ 
static int _try_sched(struct job_record *job_ptr, bitstr_t **avail_bitmap, /* Try to schedule the job. First on dedicated nodes * then on shared nodes (if so configured). */ uint16_t orig_shared; + time_t now = time(NULL); orig_shared = job_ptr->details->shared; job_ptr->details->shared = 0; tmp_bitmap = bit_copy(*avail_bitmap); @@ -255,7 +256,8 @@ static int _try_sched(struct job_record *job_ptr, bitstr_t **avail_bitmap, max_nodes, req_nodes, SELECT_MODE_WILL_RUN); job_ptr->details->shared = orig_shared; - if ((rc != SLURM_SUCCESS) && (orig_shared != 0)) { + if (((rc != SLURM_SUCCESS) || (job_ptr->start_time > now)) && + (orig_shared != 0)) { FREE_NULL_BITMAP(*avail_bitmap); *avail_bitmap= tmp_bitmap; rc = select_g_job_test(job_ptr, *avail_bitmap, diff --git a/src/plugins/sched/wiki/get_jobs.c b/src/plugins/sched/wiki/get_jobs.c index 92b6eac811539fe29d0a96a264f7c4e3120dc623..245d0701c5a2cf86138a74451c00c1315765a3de 100644 --- a/src/plugins/sched/wiki/get_jobs.c +++ b/src/plugins/sched/wiki/get_jobs.c @@ -17,7 +17,7 @@ * any later version. * * In addition, as a special exception, the copyright holders give permission - * to link the code of portions of this program with the OpenSSL library under + * to link the code of portions of this program with the OpenSSL library under * certain conditions as described in each individual source file, and * distribute linked combinations including the two. 
You must obey the GNU * General Public License in all respects for all of the code used other than @@ -51,6 +51,7 @@ static char * _dump_all_jobs(int *job_cnt, time_t update_time); static char * _dump_job(struct job_record *job_ptr, time_t update_time); static uint16_t _get_job_cpus_per_task(struct job_record *job_ptr); +static uint16_t _get_job_tasks_per_node(struct job_record *job_ptr); static uint32_t _get_job_end_time(struct job_record *job_ptr); static char * _get_job_features(struct job_record *job_ptr); static uint32_t _get_job_min_disk(struct job_record *job_ptr); @@ -83,16 +84,19 @@ static char * _task_list(struct job_record *job_ptr); * WCLIMIT=<secs>; wall clock time limit, seconds * TASKS=<cpus>; CPUs required * [NODES=<nodes>;] count of nodes required + * [TASKPERNODE=<cnt>;] tasks required per node * DPROCS=<cpus_per_task>; count of CPUs required per task * QUEUETIME=<uts>; submission time * STARTTIME=<uts>; time execution started * PARTITIONMASK=<partition>; partition name + * [DMEM=<mbytes>;] MB of memory required per cpu * RMEM=<MB>; MB of memory required * RDISK=<MB>; MB of disk space required * [COMPLETETIME=<uts>;] termination time * [SUSPENDTIME=<secs>;] seconds that job has been suspended - * [QOS=<quality_of_service>]; quality of service - * [ACCOUNT=<bank_account>]; bank account name + * [ACCOUNT=<bank_account>;] bank account name + * [QOS=<quality_of_service>;] quality of service + * [RCLASS=<resource_class>;] resource class * [COMMENT=<whatever>;] job dependency or account number * UNAME=<user_name>; user name * GNAME=<group_name>; group name @@ -205,7 +209,7 @@ static char * _dump_job(struct job_record *job_ptr, time_t update_time) { char tmp[16384], *buf = NULL; char *uname, *gname; - uint32_t end_time, suspend_time; + uint32_t end_time, suspend_time, min_mem; if (!job_ptr) return NULL; @@ -266,10 +270,18 @@ static char * _dump_job(struct job_record *job_ptr, time_t update_time) xstrcat(buf, tmp); if (!IS_JOB_FINISHED(job_ptr)) { + 
uint16_t tpn; snprintf(tmp, sizeof(tmp), "NODES=%u;", _get_job_min_nodes(job_ptr)); xstrcat(buf, tmp); + tpn = _get_job_tasks_per_node(job_ptr); + if (tpn > 0) { + snprintf(tmp, sizeof(tmp), + "TASKPERNODE=%u;", + tpn); + xstrcat(buf, tmp); + } } snprintf(tmp, sizeof(tmp), @@ -284,6 +296,13 @@ static char * _dump_job(struct job_record *job_ptr, time_t update_time) job_ptr->partition); xstrcat(buf, tmp); + min_mem = _get_job_min_mem(job_ptr); + if (min_mem & MEM_PER_CPU) { + snprintf(tmp, sizeof(tmp), + "DMEM=%u;", min_mem & (~MEM_PER_CPU)); + xstrcat(buf, tmp); + } + snprintf(tmp, sizeof(tmp), "RMEM=%u;RDISK=%u;", _get_job_min_mem(job_ptr), @@ -306,7 +325,7 @@ static char * _dump_job(struct job_record *job_ptr, time_t update_time) if (job_ptr->account) { /* allow QOS spec in form "qos-name" */ - if (!strncmp(job_ptr->account,"qos-",4)) { + if (!strncmp(job_ptr->account, "qos-", 4)) { snprintf(tmp, sizeof(tmp), "QOS=%s;", job_ptr->account + 4); } else { @@ -317,9 +336,33 @@ static char * _dump_job(struct job_record *job_ptr, time_t update_time) } if (job_ptr->comment && job_ptr->comment[0]) { - snprintf(tmp,sizeof(tmp), - "COMMENT=%s;", job_ptr->comment); - xstrcat(buf,tmp); + /* Parse comment for class/qos spec */ + char *copy; + char *cred, *value; + copy = xstrdup(job_ptr->comment); + cred = strtok(copy, ","); + while (cred != NULL) { + if (!strncmp(cred, "qos:", 4)) { + value = &cred[4]; + if (value[0] != '\0') { + snprintf(tmp, sizeof(tmp), + "QOS=%s;", value); + xstrcat(buf, tmp); + } + } else if (!strncmp(cred, "class:", 6)) { + value = &cred[6]; + if (value[0] != '\0') { + snprintf(tmp, sizeof(tmp), + "RCLASS=%s;", value); + xstrcat(buf, tmp); + } + } + cred = strtok(NULL, ","); + } + xfree(copy); + snprintf(tmp, sizeof(tmp), + "COMMENT=%s;", job_ptr->comment); + xstrcat(buf, tmp); } if (job_ptr->details && @@ -346,6 +389,16 @@ static uint16_t _get_job_cpus_per_task(struct job_record *job_ptr) return cpus_per_task; } + +static uint16_t 
_get_job_tasks_per_node(struct job_record *job_ptr) +{ + uint16_t tasks_per_node = 0; + + if (job_ptr->details && job_ptr->details->ntasks_per_node) + tasks_per_node = job_ptr->details->ntasks_per_node; + return tasks_per_node; +} + static uint32_t _get_job_min_mem(struct job_record *job_ptr) { if (job_ptr->details)