diff --git a/NEWS b/NEWS index b614c39103518aec7ebffaab8a16d306591664ea..e035f4c92032395710ba4935d024ce4e73ac6236 100644 --- a/NEWS +++ b/NEWS @@ -157,6 +157,8 @@ documents those changes that are of interest to users and admins. is already running. -- Email messages for job array events print now use the job ID using the format "#_# (#)" rather than just the internal job ID. + -- Set the number of free licenses to be 0 if the global license count decreases + and total is less than in use. * Changes in Slurm 14.03.3-2 ============================ diff --git a/doc/html/faq.shtml b/doc/html/faq.shtml index 00a7a3cce4317fdeadf70c598a8268f823ddce38..1b854f08e9209a90a254d15f9c0749de7b5f6a72 100644 --- a/doc/html/faq.shtml +++ b/doc/html/faq.shtml @@ -1809,7 +1809,7 @@ case_sensitive = False <p><a name="ha_db"><b>54. How critical is configuring high availability for my database?</b></a></br> <ul> -<li>Consider if you really need mysql failover. Short outage of slurdbd is not +<li>Consider if you really need mysql failover. Short outage of slurmdbd is not a problem, because slurmctld will store all data in memory and send it to slurmdbd when it's back operating. The slurmctld daemon will also cache all user limits and fair share information.</li> diff --git a/src/common/slurm_protocol_pack.c b/src/common/slurm_protocol_pack.c index 33c29295a6069322b8af258bef6a74eb89381844..28f118fcbca35860cc3e42441b317bd11f398cd5 100644 --- a/src/common/slurm_protocol_pack.c +++ b/src/common/slurm_protocol_pack.c @@ -11376,10 +11376,15 @@ _unpack_license_info_msg(license_info_msg_t **msg, &zz, buffer); safe_unpack32(&((*msg)->lic_array[i]).total, buffer); safe_unpack32(&((*msg)->lic_array[i]).in_use, buffer); - (*msg)->lic_array[i].available = - (*msg)->lic_array[i].total - - (*msg)->lic_array[i].in_use; - xassert((*msg)->lic_array[i].available >= 0); + /* The total number of licenses can decrease + * at runtime. + */ + if ((*msg)->lic_array[i].total < (*msg)->lic_array[i].in_use) + (*msg)->lic_array[i].available = 0; + else + (*msg)->lic_array[i].available = + (*msg)->lic_array[i].total - + (*msg)->lic_array[i].in_use; safe_unpack8(&((*msg)->lic_array[i]).remote, buffer); } diff --git a/src/plugins/sched/backfill/backfill.c b/src/plugins/sched/backfill/backfill.c index eca08576597afda0b2c22efb64d69cbdba410f72..8ddb1c59defa65c432d30115360badc026d665eb 100644 --- a/src/plugins/sched/backfill/backfill.c +++ b/src/plugins/sched/backfill/backfill.c @@ -174,16 +174,15 @@ static void _dump_job_sched(struct job_record *job_ptr, time_t end_time, xfree(node_list); } -static void _dump_job_test(struct job_record *job_ptr, bitstr_t *avail_bitmap) +static void _dump_job_test(struct job_record *job_ptr, bitstr_t *avail_bitmap, + time_t start_time) { char begin_buf[32], *node_list; - if (job_ptr->start_time == 0) { + if (start_time == 0) strcpy(begin_buf, "NOW"); - } else { - slurm_make_time_str(&job_ptr->start_time, begin_buf, - sizeof(begin_buf)); - } + else + slurm_make_time_str(&start_time, begin_buf, sizeof(begin_buf)); node_list = bitmap2node_name(avail_bitmap); info("Test job %u at %s on %s", job_ptr->job_id, begin_buf, node_list); xfree(node_list); @@ -654,6 +653,7 @@ static int _attempt_backfill(void) uint32_t min_nodes, max_nodes, req_nodes; bitstr_t *avail_bitmap = NULL, *resv_bitmap = NULL; bitstr_t *exc_core_bitmap = NULL, *non_cg_bitmap = NULL; + bitstr_t *previous_bitmap = NULL; time_t now, sched_start, later_start, start_res, resv_end; node_space_map_t *node_space; struct timeval bf_time1, bf_time2; @@ -934,6 +934,7 @@ static int _attempt_backfill(void) /* Determine impact of any resource reservations */ later_start = now; + FREE_NULL_BITMAP(previous_bitmap); TRY_LATER: if (slurmctld_config.shutdown_time) break; @@ -945,7 +946,7 @@ static int _attempt_backfill(void) job_ptr->time_limit = orig_time_limit; if (debug_flags & DEBUG_FLAG_BACKFILL) { END_TIMER; - info("backfill: completed yielding locks 2" + info("backfill: completed yielding locks " "after testing %d jobs, %s", job_test_count, TIME_STR); } @@ -1030,12 +1031,16 @@ static int _attempt_backfill(void) /* Test if insufficient nodes remain OR * required nodes missing OR - * nodes lack features */ + * nodes lack features OR + * no change since previously tested nodes (only changes + * in other partition nodes) */ if ((bit_set_count(avail_bitmap) < min_nodes) || ((job_ptr->details->req_node_bitmap) && (!bit_super_set(job_ptr->details->req_node_bitmap, avail_bitmap))) || - (job_req_node_filter(job_ptr, avail_bitmap))) { + (job_req_node_filter(job_ptr, avail_bitmap)) || + (previous_bitmap && + bit_equal(previous_bitmap, avail_bitmap))) { if (later_start) { job_ptr->start_time = 0; goto TRY_LATER; @@ -1046,6 +1051,9 @@ static int _attempt_backfill(void) continue; } + FREE_NULL_BITMAP(previous_bitmap); + previous_bitmap = bit_copy(avail_bitmap); + /* Identify nodes which are definitely off limits */ FREE_NULL_BITMAP(resv_bitmap); resv_bitmap = bit_copy(avail_bitmap); @@ -1061,7 +1069,7 @@ static int _attempt_backfill(void) } if (debug_flags & DEBUG_FLAG_BACKFILL) - _dump_job_test(job_ptr, avail_bitmap); + _dump_job_test(job_ptr, avail_bitmap, start_res); j = _try_sched(job_ptr, &avail_bitmap, min_nodes, max_nodes, req_nodes, exc_core_bitmap); @@ -1117,7 +1125,7 @@ static int _attempt_backfill(void) continue; } else if (rc != SLURM_SUCCESS) { /* Planned to start job, but something bad - * happended. */ + * happened. */ job_ptr->start_time = 0; break; } else { @@ -1191,6 +1199,7 @@ static int _attempt_backfill(void) FREE_NULL_BITMAP(exc_core_bitmap); FREE_NULL_BITMAP(resv_bitmap); FREE_NULL_BITMAP(non_cg_bitmap); + FREE_NULL_BITMAP(previous_bitmap); for (i=0; ; ) { FREE_NULL_BITMAP(node_space[i].avail_bitmap);