diff --git a/NEWS b/NEWS index 17da4cc84cf3dae92b0f27997fd1331528ff313e..676e8cb88fe2c312a8b67c7cf90c5792f71b8329 100644 --- a/NEWS +++ b/NEWS @@ -123,8 +123,8 @@ documents those changes that are of interest to users and admins. -- Added SchedulerParameters value of Ignore_NUMA. -- Fix issues with code when using automake 1.14.1 -- select/cons_res plugin: Fix memory leak related to job preemption. - -- After reconfig rebuild the job node counters only for jobs that has - not finished yet otherwise if requeued the job may enter an invalid + -- After reconfig rebuild the job node counters only for jobs that have + not finished yet, otherwise if requeued the job may enter an invalid COMPLETING state. -- Do not purge the script and environment files for completed jobs on slurmctld reconfiguration or restart (they might be later requeued). @@ -146,10 +146,10 @@ documents those changes that are of interest to users and admins. primary and there is a split brain problem). -- Fix scontrol to accept update jobid=# numtasks=# -- If the backup slurmctld assumes primary status, then do NOT purge any - job state files (batch script and environment files) but if any attempt - is made to re-use them consider this a fatal error. It may indicate that - multiple primary slurmctld daemons are active (e.g. both backup and primary - are functioning as primary and there is a split brain problem). + job state files (batch script and environment files) and do not re-use them. + This may indicate that multiple primary slurmctld daemons are active (e.g. + both backup and primary are functioning as primary and there is a split + brain problem). -- Set correct error code when requeuing a completing/pending job -- When checking for if dependency of type afterany, afterok and afternotok don't clear the dependency if the job is completing. @@ -164,6 +164,11 @@ documents those changes that are of interest to users and admins. format "#_# (#)" rather than just the internal job ID. 
-- Set the number of free licenses to be 0 if the global license count decreases and total is less than in use. + -- Add DebugFlag of BackfillMap. Previously a DebugFlag value of Backfill + logged information about what it was doing plus a map of expected resource + use in the future. Now that very verbose resource use map is only logged + with a DebugFlag value of BackfillMap. + -- Fix slurmstepd core dump. * Changes in Slurm 14.03.3-2 ============================ diff --git a/doc/html/quickstart_admin.shtml b/doc/html/quickstart_admin.shtml index d5303995370791f210929c8320f6a85d38bf08b8..de0c63cbfc7ec92affa3664a67f38b0c8b2a398d 100644 --- a/doc/html/quickstart_admin.shtml +++ b/doc/html/quickstart_admin.shtml @@ -691,7 +691,13 @@ but may also include very minor enhancements.</p> release number as the Slurmctld daemons. In other words, when changing the version to a higher release number (e.g from 2.4 to 2.5) <b>always upgrade the SlurmDBD daemon first</b>. -The slurmctld daemon must also be upgraded before or at the same time as +Database table changes may be required for the upgrade, for example +adding new fields to existing tables. +If the database contains a large number of entries, <b>the SlurmDBD daemon +may require tens of minutes to update the database and be unresponsive +during this time interval</b>.</p> + +<p>The slurmctld daemon must also be upgraded before or at the same time as the slurmd daemons on the compute nodes. Generally upgrading Slurm on all of the login and compute nodes is recommended, although rolling upgrades are also possible (i.e. upgrading the head node(s) @@ -718,7 +724,12 @@ versions (e.g. 2.5.0-pre1 to 2.5.0-pre2). We'll try to note these cases in the NEWS file. Contents of major releases are also described in the RELEASE_NOTES file.</p> -<p>Be mindful of your configured SlurmdTimeout and SlurmctldTimeout values. +<p>The libslurm.so version is increased every major release. 
+So things like MPI libraries with Slurm integration should be recompiled. +Sometimes it works to just symlink the old so name(s) to the new one, but this +has no guarantee of working.</p> + +<p><b>Be mindful of your configured SlurmdTimeout and SlurmctldTimeout values</b>. If your Slurm daemon down time associated with an upgrade take longer than the timeout value, nodes may be marked DOWN and their jobs killed. You can either increase timeout values during an upgrade or insure that your @@ -765,6 +776,6 @@ options such as mysql and gui tools via a configuration menu.</p> </pre> <p class="footer"><a href="#top">top</a></p> -<p style="text-align:center;">Last modified 29 May 2014</p> +<p style="text-align:center;">Last modified 11 June 2014</p> <!--#include virtual="footer.txt"--> diff --git a/doc/man/man5/slurm.conf.5 b/doc/man/man5/slurm.conf.5 index 4876f8bcd25105c7aa95cdf1e19b47c2063d6f99..d609a2b0506344e18f2002160b0643e51659c860 100644 --- a/doc/man/man5/slurm.conf.5 +++ b/doc/man/man5/slurm.conf.5 @@ -441,6 +441,11 @@ Valid subsystems available today (with more to come) include: \fBBackfill\fR Backfill scheduler details .TP +\fBBackfillMap\fR +Backfill scheduler to log a very verbose map of reserved resources through +time. Combine with \fBBackfill\fR for a verbose and complete view of the +backfill scheduler's work. 
+.TP \fBBGBlockAlgo\fR BlueGene block selection details .TP diff --git a/slurm/slurm.h.in b/slurm/slurm.h.in index 12c66c0597b8991569d4c2c8e2d3b0a6427460d4..3cdef3588d415f1cbb7600ff7b7114945ec0efe5 100644 --- a/slurm/slurm.h.in +++ b/slurm/slurm.h.in @@ -2028,7 +2028,8 @@ typedef struct reservation_name_msg { #define DEBUG_FLAG_FILESYSTEM 0x00800000 /* AcctGatherFilesystem plugin */ #define DEBUG_FLAG_JOB_CONT 0x01000000 /* JobContainer plugin */ #define DEBUG_FLAG_TASK 0x02000000 /* TaskType plugin */ -#define DEBUG_FLAG_PROTOCOL 0x04000000 +#define DEBUG_FLAG_PROTOCOL 0x04000000 /* Communication protocol */ +#define DEBUG_FLAG_BACKFILL_MAP 0x08000000 /* Backfill scheduler node map */ #define GROUP_FORCE 0x8000 /* if set, update group membership * info even if no updates to diff --git a/src/common/read_config.c b/src/common/read_config.c index f4382d2c54169994177513996e538ae81248e07f..a968689629518f41415c7dde1def4fa08fca9583 100644 --- a/src/common/read_config.c +++ b/src/common/read_config.c @@ -4170,6 +4170,11 @@ extern char * debug_flags2str(uint32_t debug_flags) xstrcat(rc, ","); xstrcat(rc, "Backfill"); } + if (debug_flags & DEBUG_FLAG_BACKFILL_MAP) { + if (rc) + xstrcat(rc, ","); + xstrcat(rc, "BackfillMap"); + } if (debug_flags & DEBUG_FLAG_BG_ALGO) { if (rc) xstrcat(rc, ","); @@ -4318,6 +4323,8 @@ extern uint32_t debug_str2flags(char *debug_flags) while (tok) { if (strcasecmp(tok, "Backfill") == 0) rc |= DEBUG_FLAG_BACKFILL; + else if (strcasecmp(tok, "BackfillMap") == 0) + rc |= DEBUG_FLAG_BACKFILL_MAP; else if (strcasecmp(tok, "BGBlockAlgo") == 0) rc |= DEBUG_FLAG_BG_ALGO; else if (strcasecmp(tok, "BGBlockAlgoDeep") == 0) diff --git a/src/plugins/sched/backfill/backfill.c b/src/plugins/sched/backfill/backfill.c index 63f25550869e5928992c899af280df041cd47245..b6614edc49237829bf0e1990b326d679949975ae 100644 --- a/src/plugins/sched/backfill/backfill.c +++ b/src/plugins/sched/backfill/backfill.c @@ -387,7 +387,7 @@ static int _try_sched(struct job_record 
*job_ptr, bitstr_t **avail_bitmap, if (((rc != SLURM_SUCCESS) || (job_ptr->start_time > now)) && (orig_shared != 0)) { FREE_NULL_BITMAP(*avail_bitmap); - *avail_bitmap= tmp_bitmap; + *avail_bitmap = tmp_bitmap; rc = select_g_job_test(job_ptr, *avail_bitmap, min_nodes, max_nodes, req_nodes, SELECT_MODE_WILL_RUN, @@ -678,7 +678,7 @@ static int _attempt_backfill(void) bool already_counted; uint32_t reject_array_job_id = 0; struct part_record *reject_array_part = NULL; - uint32_t job_start_cnt = 0; + uint32_t job_start_cnt = 0, start_time; time_t config_update = slurmctld_conf.last_update; time_t part_update = last_part_update; struct timeval start_tv; @@ -717,7 +717,10 @@ static int _attempt_backfill(void) job_queue = build_job_queue(true, true); if (list_count(job_queue) == 0) { - debug("backfill: no jobs to backfill"); + if (debug_flags & DEBUG_FLAG_BACKFILL) + info("backfill: no jobs to backfill"); + else + debug("backfill: no jobs to backfill"); list_destroy(job_queue); return 0; } @@ -742,7 +745,7 @@ static int _attempt_backfill(void) node_space[0].avail_bitmap = bit_copy(avail_node_bitmap); node_space[0].next = 0; node_space_recs = 1; - if (debug_flags & DEBUG_FLAG_BACKFILL) + if (debug_flags & DEBUG_FLAG_BACKFILL_MAP) _dump_node_space_table(node_space); if (max_backfill_job_per_part) { @@ -773,7 +776,8 @@ static int _attempt_backfill(void) if (debug_flags & DEBUG_FLAG_BACKFILL) { END_TIMER; info("backfill: completed yielding locks " - "after testing %d jobs, %s", + "after testing %u(%d) jobs, %s", + slurmctld_diag_stats.bf_last_depth, job_test_count, TIME_STR); } if ((_yield_locks(yield_sleep) && !backfill_continue) || @@ -781,8 +785,10 @@ static int _attempt_backfill(void) (last_part_update != part_update)) { if (debug_flags & DEBUG_FLAG_BACKFILL) { info("backfill: system state changed, " - "breaking out after testing %d " - "jobs", job_test_count); + "breaking out after testing " + "%u(%d) jobs", + slurmctld_diag_stats.bf_last_depth, + job_test_count); } rc = 
1; xfree(job_queue_rec); @@ -808,7 +814,10 @@ static int _attempt_backfill(void) } orig_time_limit = job_ptr->time_limit; part_ptr = job_queue_rec->part_ptr; + job_test_count++; + slurmctld_diag_stats.bf_last_depth++; + already_counted = false; xfree(job_queue_rec); if (!IS_JOB_PENDING(job_ptr)) @@ -828,9 +837,6 @@ static int _attempt_backfill(void) if (debug_flags & DEBUG_FLAG_BACKFILL) info("backfill test for job %u", job_ptr->job_id); - slurmctld_diag_stats.bf_last_depth++; - already_counted = false; - if (max_backfill_job_per_part) { bool skip_job = false; for (j = 0; j < bf_parts; j++) { @@ -961,7 +967,8 @@ static int _attempt_backfill(void) if (debug_flags & DEBUG_FLAG_BACKFILL) { END_TIMER; info("backfill: completed yielding locks " - "after testing %d jobs, %s", + "after testing %u(%d) jobs, %s", + slurmctld_diag_stats.bf_last_depth, job_test_count, TIME_STR); } if ((_yield_locks(yield_sleep) && !backfill_continue) || @@ -969,8 +976,10 @@ static int _attempt_backfill(void) (last_part_update != part_update)) { if (debug_flags & DEBUG_FLAG_BACKFILL) { info("backfill: system state changed, " - "breaking out after testing %d " - "jobs", job_test_count); + "breaking out after testing " + "%u(%d) jobs", + slurmctld_diag_stats.bf_last_depth, + job_test_count); } rc = 1; break; @@ -1082,7 +1091,7 @@ static int _attempt_backfill(void) already_counted = true; } - if (debug_flags & DEBUG_FLAG_BACKFILL) + if (debug_flags & DEBUG_FLAG_BACKFILL_MAP) _dump_job_test(job_ptr, avail_bitmap, start_res); j = _try_sched(job_ptr, &avail_bitmap, min_nodes, max_nodes, req_nodes, exc_core_bitmap); @@ -1098,7 +1107,7 @@ static int _attempt_backfill(void) job_ptr->start_time = start_res; last_job_update = now; } - if (job_ptr->start_time <= now) { + if (job_ptr->start_time <= now) { /* Can start now */ uint32_t save_time_limit = job_ptr->time_limit; int rc = _start_job(job_ptr, resv_bitmap); if (qos_ptr && (qos_ptr->flags & QOS_FLAG_NO_RESERVE)) { @@ -1138,10 +1147,16 @@ static int 
_attempt_backfill(void) job_ptr->start_time = 0; continue; } else if (rc != SLURM_SUCCESS) { - /* Planned to start job, but something bad - * happened. */ - job_ptr->start_time = 0; - break; + if (debug_flags & DEBUG_FLAG_BACKFILL) { + info("backfill: planned start of job %u" + " failed: %s", job_ptr->job_id, + slurm_strerror(rc)); + } + /* Drop through and reserve these resources. + * Likely due to state changes during sleep. + * Make best-effort based upon original state */ + job_ptr->time_limit = orig_time_limit; + later_start = 0; } else { /* Started this job, move to next one */ reject_array_job_id = 0; @@ -1152,15 +1167,24 @@ static int _attempt_backfill(void) if (save_time_limit != job_ptr->time_limit) jobacct_storage_g_job_start(acct_db_conn, job_ptr); + job_start_cnt++; if (max_backfill_jobs_start && - (++job_start_cnt >= max_backfill_jobs_start)) + (job_start_cnt >= max_backfill_jobs_start)){ + if (debug_flags & DEBUG_FLAG_BACKFILL) { + info("backfill: bf_max_job_start" + " limit of %d reached", + max_backfill_jobs_start); + } break; + } continue; } - } else + } else { job_ptr->time_limit = orig_time_limit; + } - if (later_start && (job_ptr->start_time > later_start)) { + if (later_start && + (job_ptr->start_time > (later_start+backfill_resolution))) { /* Try later when some nodes currently reserved for * pending jobs are free */ job_ptr->start_time = 0; @@ -1173,13 +1197,22 @@ static int _attempt_backfill(void) } if (node_space_recs >= max_backfill_job_cnt) { - /* Already have too many jobs to deal with */ + if (debug_flags & DEBUG_FLAG_BACKFILL) { + info("backfill: table size limit of %u reached", + max_backfill_job_cnt); + } break; } + start_time = job_ptr->start_time; end_reserve = job_ptr->start_time + (time_limit * 60); - if (_test_resv_overlap(node_space, avail_bitmap, - job_ptr->start_time, end_reserve)) { + start_time = (start_time / backfill_resolution) * + backfill_resolution; + end_reserve = (end_reserve / backfill_resolution) * + 
backfill_resolution; + if ((job_ptr->start_time > now) && + _test_resv_overlap(node_space, avail_bitmap, + start_time, end_reserve)) { /* This job overlaps with an existing reservation for * job to be backfill scheduled, which the sched * plugin does not know about. Try again later. */ @@ -1200,9 +1233,9 @@ static int _attempt_backfill(void) xfree(job_ptr->sched_nodes); job_ptr->sched_nodes = bitmap2node_name(avail_bitmap); bit_not(avail_bitmap); - _add_reservation(job_ptr->start_time, end_reserve, + _add_reservation(start_time, end_reserve, avail_bitmap, node_space, &node_space_recs); - if (debug_flags & DEBUG_FLAG_BACKFILL) + if (debug_flags & DEBUG_FLAG_BACKFILL_MAP) _dump_node_space_table(node_space); } xfree(bf_part_jobs); @@ -1226,7 +1259,8 @@ static int _attempt_backfill(void) _do_diag_stats(&bf_time1, &bf_time2, yield_sleep); if (debug_flags & DEBUG_FLAG_BACKFILL) { END_TIMER; - info("backfill: completed testing %d jobs, %s", + info("backfill: completed testing %u(%d) jobs, %s", + slurmctld_diag_stats.bf_last_depth, job_test_count, TIME_STR); } return rc; @@ -1349,12 +1383,19 @@ static void _add_reservation(uint32_t start_time, uint32_t end_reserve, bool placed = false; int i, j; - /* If we decrease the resolution of our timing information, this can - * decrease the number of records managed and increase performance */ - start_time = (start_time / backfill_resolution) * backfill_resolution; - end_reserve = (end_reserve / backfill_resolution) * backfill_resolution; +#if 0 + info("add job start:%u end:%u", start_time, end_reserve); + for (j = 0; ; ) { + info("node start:%u end:%u", + (uint32_t) node_space[j].begin_time, + (uint32_t) node_space[j].end_time); + if ((j = node_space[j].next) == 0) + break; + } +#endif - for (j=0; ; ) { + start_time = MAX(start_time, node_space[0].begin_time); + for (j = 0; ; ) { if (node_space[j].end_time > start_time) { /* insert start entry record */ i = *node_space_recs; @@ -1389,6 +1430,9 @@ static void 
_add_reservation(uint32_t start_time, uint32_t end_reserve, (*node_space_recs)++; break; } + if (end_reserve == node_space[j].end_time) { + break; + } } break; } @@ -1396,7 +1440,7 @@ static void _add_reservation(uint32_t start_time, uint32_t end_reserve, break; } - for (j=0; ; ) { + for (j = 0; ; ) { if ((node_space[j].begin_time >= start_time) && (node_space[j].end_time <= end_reserve)) bit_and(node_space[j].avail_bitmap, res_bitmap);