diff --git a/NEWS b/NEWS index 7a502ddd0d3761da5ec9aa6bfaf98db41c084d82..45bbb3b2975b8a7377f0b056a36e88c325c9c4a0 100644 --- a/NEWS +++ b/NEWS @@ -76,6 +76,11 @@ documents those changes that are of interest to users and admins. -- Fix bug in tracking memory allocated on a node for select/cons_res plugin. -- Fixed a race condition when writing labelled output with a file per task or per node, which potentially closed a file before all data was written. + -- BLUEGENE - Fix so that if a job spans both less than and more than + 1 midplane in size, the connection type is checked appropriately. + -- Make sched/backfill properly schedule jobs with constraints having node + counts. NOTE: Backfill of jobs with constraints having exclusive OR + operators is not fully supported. * Changes in SLURM 2.0.1 ======================== diff --git a/doc/html/faq.shtml b/doc/html/faq.shtml index 80f974bae600f59bc15aa73c5de2b666ff9c4fb5..892c4c3bf661e0c62042c274bde23a31080fcf7c 100644 --- a/doc/html/faq.shtml +++ b/doc/html/faq.shtml @@ -262,9 +262,10 @@ this command as user root!</b></p> There are significant limitations in the current backfill scheduler plugin. It was designed to perform backfill node scheduling for a homogeneous cluster. It does not manage scheduling on individual processors (or other consumable -resources). It also does not update the required or excluded node list of -individual jobs. These are the current limitations. You can use the -scontrol show command to check if these conditions apply.</p> +resources). It does not update the required or excluded node list of +individual jobs. It does support jobs with constraints/features unless +the exclusive OR operator is used in the constraint expression. +You can use the scontrol show command to check if these conditions apply.</p> <ul> <li>Partition: State=UP</li> <li>Partition: RootOnly=NO</li> @@ -1203,6 +1204,6 @@ set a different nodeaddr that is known by your other nodes.</p> <p class="footer"><a href="#top">top</a></p> -<p style="text-align:center;">Last modified 7 May 2009</p> +<p style="text-align:center;">Last modified 12 June 2009</p> <!--#include virtual="footer.txt"--> diff --git a/doc/html/priority_multifactor.shtml b/doc/html/priority_multifactor.shtml index ba79a4f923e08046eeac0fb8730d9f41b84bc5bb..c27aed7f84dad29596e475928208273829b8eaa8 100644 --- a/doc/html/priority_multifactor.shtml +++ b/doc/html/priority_multifactor.shtml @@ -119,6 +119,11 @@ Job_priority = <a name=fairshare> <h2>Fair-share Factor</h2></a> +<b>Note:</b> Computing the fair-share factor requires the installation +and operation of the <a href="accounting.html">SLURM Accounting +Database</a> to provide the assigned shares and the consumed +computing resources described below. + <P> The fair-share component to a job's priority influences the order in which a user's queued jobs are scheduled to run based on the portion of the computing resources they have been allocated and the resources their jobs have already consumed. 
The fair-share factor does not involve a fixed allotment, whereby a user's access to a machine is cut off once that allotment is reached.</P> <P> Instead, the fair-share factor serves to prioritize queued jobs such that those jobs charging accounts that are under-serviced are scheduled first, while jobs charging accounts that are over-serviced are scheduled when the machine would otherwise go idle.</P> @@ -531,7 +536,7 @@ PriorityWeightQOS=0 # don't use the qos factor </PRE> <!--------------------------------------------------------------------------> -<p style="text-align:center;">Last modified 08 April 2009</p> +<p style="text-align:center;">Last modified 12 June 2009</p> <!--#include virtual="footer.txt"--> diff --git a/src/plugins/sched/backfill/backfill.c b/src/plugins/sched/backfill/backfill.c index 9402c78bbef38762becbddc4abcfa440fe6fabb6..6f15b8b6356d4a4429ede7eaf18ed4388d2d49b7 100644 --- a/src/plugins/sched/backfill/backfill.c +++ b/src/plugins/sched/backfill/backfill.c @@ -120,7 +120,11 @@ static void _attempt_backfill(void); static void _diff_tv_str(struct timeval *tv1,struct timeval *tv2, char *tv_str, int len_tv_str); static bool _more_work(void); -static int _start_job(struct job_record *job_ptr, bitstr_t *avail_bitmap); +static int _num_feature_count(struct job_record *job_ptr); +static int _start_job(struct job_record *job_ptr, bitstr_t *avail_bitmap); +static int _try_sched(struct job_record *job_ptr, bitstr_t *avail_bitmap, + uint32_t min_nodes, uint32_t max_nodes, + uint32_t req_nodes); #if __DEBUG /* Log resource allocate table */ @@ -161,6 +165,110 @@ static void _diff_tv_str(struct timeval *tv1,struct timeval *tv2, snprintf(tv_str, len_tv_str, "usec=%ld", delta_t); } +/* test if job has feature count specification */ +static int _num_feature_count(struct job_record *job_ptr) +{ + struct job_details *detail_ptr = job_ptr->details; + int rc = 0; + ListIterator feat_iter; + struct feature_record *feat_ptr; + + if (detail_ptr->feature_list == NULL) /* no constraints */ + return rc; + + feat_iter = list_iterator_create(detail_ptr->feature_list); + while ((feat_ptr = (struct feature_record *) list_next(feat_iter))) { + if (feat_ptr->count) + rc++; + } + list_iterator_destroy(feat_iter); + + return rc; +} + +/* Attempt to schedule a specific job on specific available nodes + * IN job_ptr - job to schedule + * IN/OUT avail_bitmap - nodes available/selected to use + * RET SLURM_SUCCESS on success, otherwise an error code + */ +static int _try_sched(struct job_record *job_ptr, bitstr_t *avail_bitmap, + uint32_t min_nodes, uint32_t max_nodes, + uint32_t req_nodes) +{ + bitstr_t *tmp_bitmap; + int rc = SLURM_SUCCESS; + int feat_cnt = _num_feature_count(job_ptr); + + if (feat_cnt) { + /* Ideally schedule the job feature by feature, + * but I don't want to add that complexity here + * right now, so clear the feature counts and try + * to schedule. This will work if there is only + * one feature count. It should work fairly well + * in cases where there are multiple feature + * counts. 
*/ + struct job_details *detail_ptr = job_ptr->details; + ListIterator feat_iter; + struct feature_record *feat_ptr; + int i = 0, list_size; + uint16_t *feat_cnt_orig = NULL, high_cnt = 0; + + /* Clear the feature counts */ + list_size = list_count(detail_ptr->feature_list); + feat_cnt_orig = xmalloc(sizeof(uint16_t) * list_size); + feat_iter = list_iterator_create(detail_ptr->feature_list); + while ((feat_ptr = + (struct feature_record *) list_next(feat_iter))) { + high_cnt = MAX(high_cnt, feat_ptr->count); + feat_cnt_orig[i++] = feat_ptr->count; + feat_ptr->count = 0; + } + list_iterator_destroy(feat_iter); + + if ((job_req_node_filter(job_ptr, avail_bitmap) != + SLURM_SUCCESS) || + (bit_set_count(avail_bitmap) < high_cnt)) { + rc = ESLURM_NODES_BUSY; + } else { + rc = select_g_job_test(job_ptr, avail_bitmap, + high_cnt, max_nodes, req_nodes, + SELECT_MODE_WILL_RUN); + } + + /* Restore the feature counts */ + i = 0; + feat_iter = list_iterator_create(detail_ptr->feature_list); + while ((feat_ptr = + (struct feature_record *) list_next(feat_iter))) { + feat_ptr->count = feat_cnt_orig[i++]; + } + list_iterator_destroy(feat_iter); + xfree(feat_cnt_orig); + } else { + /* Try to schedule the job. First on dedicated nodes + * then on shared nodes (if so configured). */ + uint16_t orig_shared; + orig_shared = job_ptr->details->shared; + job_ptr->details->shared = 0; + tmp_bitmap = bit_copy(avail_bitmap); + rc = select_g_job_test(job_ptr, avail_bitmap, min_nodes, + max_nodes, req_nodes, + SELECT_MODE_WILL_RUN); + job_ptr->details->shared = orig_shared; + if ((rc != SLURM_SUCCESS) && (orig_shared != 0)) { + FREE_NULL_BITMAP(avail_bitmap); + avail_bitmap = tmp_bitmap; + rc = select_g_job_test(job_ptr, avail_bitmap, + min_nodes, max_nodes, req_nodes, + SELECT_MODE_WILL_RUN); + } else + FREE_NULL_BITMAP(tmp_bitmap); + } + + return rc; + +} + /* Terminate backfill_agent */ extern void stop_backfill_agent(void) { @@ -230,8 +338,7 @@ static void _attempt_backfill(void) struct part_record *part_ptr; uint32_t end_time, end_reserve, time_limit; uint32_t min_nodes, max_nodes, req_nodes; - uint16_t orig_shared; - bitstr_t *avail_bitmap = NULL, *tmp_bitmap; + bitstr_t *avail_bitmap = NULL, *resv_bitmap = NULL; time_t now = time(NULL), start_res; node_space_map_t node_space[MAX_BACKFILL_JOB_CNT + 2]; @@ -334,8 +441,12 @@ static void _attempt_backfill(void) if ((j = node_space[j].next) == 0) break; } - if (job_req_node_filter(job_ptr, avail_bitmap)) - continue; /* problem with features */ + + /* Identify nodes which are definitely off limits */ + FREE_NULL_BITMAP(resv_bitmap); + resv_bitmap = bit_copy(avail_bitmap); + bit_not(resv_bitmap); + if (job_ptr->details->exc_node_bitmap) { bit_not(job_ptr->details->exc_node_bitmap); bit_and(avail_bitmap, @@ -348,36 +459,24 @@ static void _attempt_backfill(void) continue; /* required nodes missing */ if (bit_set_count(avail_bitmap) < min_nodes) continue; /* insufficient nodes remain */ + if (job_req_node_filter(job_ptr, avail_bitmap)) + continue; /* nodes lack features */ - /* Try to schedule the job. First on dedicated nodes - * then on shared nodes (if so configured). 
*/ - orig_shared = job_ptr->details->shared; - job_ptr->details->shared = 0; - tmp_bitmap = bit_copy(avail_bitmap); - j = select_g_job_test(job_ptr, avail_bitmap, min_nodes, - max_nodes, req_nodes, - SELECT_MODE_WILL_RUN); - job_ptr->details->shared = orig_shared; - if ((j != SLURM_SUCCESS) && (orig_shared != 0)) { - FREE_NULL_BITMAP(avail_bitmap); - avail_bitmap= tmp_bitmap; - j = select_g_job_test(job_ptr, avail_bitmap, min_nodes, - max_nodes, req_nodes, - SELECT_MODE_WILL_RUN); - } else - FREE_NULL_BITMAP(tmp_bitmap); + j = _try_sched(job_ptr, avail_bitmap, + min_nodes, max_nodes, req_nodes); if (j != SLURM_SUCCESS) continue; /* not runable */ job_ptr->start_time = MAX(job_ptr->start_time, start_res); if (job_ptr->start_time <= now) { - int rc = _start_job(job_ptr, avail_bitmap); - if(rc == ESLURM_ACCOUNTING_POLICY) + int rc = _start_job(job_ptr, resv_bitmap); + if (rc == ESLURM_ACCOUNTING_POLICY) + continue; + else if (rc != SLURM_SUCCESS) + /* Planned to start job, but something bad + * happened. Reserve nodes where this should + * apparently run and try more jobs. */ continue; - else if(rc != SLURM_SUCCESS) - /* Planned to start job, but something - * bad happended */ - break; } if (job_ptr->start_time > (now + BACKFILL_WINDOW)) { /* Starts too far in the future to worry about */ @@ -401,6 +500,7 @@ static void _attempt_backfill(void) #endif } FREE_NULL_BITMAP(avail_bitmap); + FREE_NULL_BITMAP(resv_bitmap); for (i=0; ; ) { bit_free(node_space[i].avail_bitmap); @@ -410,16 +510,18 @@ static void _attempt_backfill(void) xfree(job_queue); } -static int _start_job(struct job_record *job_ptr, bitstr_t *avail_bitmap) +/* Try to start the job on any non-reserved nodes */ +static int _start_job(struct job_record *job_ptr, bitstr_t *resv_bitmap) { int rc; bitstr_t *orig_exc_nodes = NULL; static uint32_t fail_jobid = 0; - if (job_ptr->details->exc_node_bitmap) + if (job_ptr->details->exc_node_bitmap) { orig_exc_nodes = job_ptr->details->exc_node_bitmap; - job_ptr->details->exc_node_bitmap = bit_copy(avail_bitmap); - bit_not(job_ptr->details->exc_node_bitmap); + bit_or(job_ptr->details->exc_node_bitmap, resv_bitmap); + } else + job_ptr->details->exc_node_bitmap = bit_copy(resv_bitmap); rc = select_nodes(job_ptr, false, NULL); bit_free(job_ptr->details->exc_node_bitmap); @@ -437,13 +539,15 @@ static int _start_job(struct job_record *job_ptr, bitstr_t *avail_bitmap) #if __DEBUG info("backfill: Jobs backfilled: %d", backfilled_jobs); #endif - } else if ((job_ptr->job_id != fail_jobid) - && (rc != ESLURM_ACCOUNTING_POLICY)) { - char *node_list = bitmap2node_name(avail_bitmap); + } else if ((job_ptr->job_id != fail_jobid) && + (rc != ESLURM_ACCOUNTING_POLICY)) { + char *node_list; + bit_not(resv_bitmap); + node_list = bitmap2node_name(resv_bitmap); /* This happens when a job has sharing disabled and * a selected node is still completing some job, * which should be a temporary situation. 
*/ - verbose("backfill: Failed to start JobId=%u on %s: %s", + verbose("backfill: Failed to start JobId=%u in %s: %s", job_ptr->job_id, node_list, slurm_strerror(rc)); xfree(node_list); fail_jobid = job_ptr->job_id; diff --git a/src/plugins/select/bluegene/plugin/bg_job_place.c b/src/plugins/select/bluegene/plugin/bg_job_place.c index b576da5418e53237351485a0fd7fb9c6dab2306f..4acd56f0053ccef3e00b793831b8e09d69859377 100644 --- a/src/plugins/select/bluegene/plugin/bg_job_place.c +++ b/src/plugins/select/bluegene/plugin/bg_job_place.c @@ -478,7 +478,16 @@ static bg_record_t *_find_matching_block(List block_list, continue; } goto good_conn_type; - } + } else if(bg_record->conn_type >= SELECT_SMALL) { + /* since we already checked to see if + the cpus were good this means we are + looking for a block in a range that + includes small and regular blocks. + So we can just continue on. + */ + goto good_conn_type; + } + #endif debug("bg block %s conn-type not usable asking for %s " "bg_record is %s", diff --git a/src/scontrol/scontrol.c b/src/scontrol/scontrol.c index 139f29c1cc439cdff5335198558f7ccd360add5b..8b514cc7075594425737385dc1181f84919c86b1 100644 --- a/src/scontrol/scontrol.c +++ b/src/scontrol/scontrol.c @@ -1297,8 +1297,9 @@ scontrol [<OPTION>] [<COMMAND>] \n\ !! Repeat the last command entered. \n\ \n\ <ENTITY> may be \"config\", \"daemons\", \"job\", \"node\", \"partition\"\n\ - \"reservation\", \"hostlist\", \"hostnames\", \"slurmd\", \"topology\"\n\ - (for BlueGene only: \"block\", \"subbp\" or \"step\"). \n\ + \"reservation\", \"hostlist\", \"hostnames\", \"slurmd\", \n\ + \"topology\", or \"step\" \n\ + (also for BlueGene only: \"block\" or \"subbp\"). \n\ \n\ <ID> may be a configuration parameter name, job id, node name, partition \n\ name, reservation name, job step id, or hostlist or pathname to a \n\ diff --git a/src/slurmctld/node_scheduler.c b/src/slurmctld/node_scheduler.c index 6b09a41a3c416bc10b2607860f307cf8e58e00a6..a17b516db42a735e5c4eab3e8a5af756cec39ec1 100644 --- a/src/slurmctld/node_scheduler.c +++ b/src/slurmctld/node_scheduler.c @@ -108,8 +108,11 @@ static int _pick_best_nodes(struct node_set *node_set_ptr, struct part_record *part_ptr, uint32_t min_nodes, uint32_t max_nodes, uint32_t req_nodes, bool test_only); +static void _reset_feature_counts(struct job_details *details_ptr); +static bool _valid_feature_counts(struct job_details *details_ptr); static bitstr_t *_valid_features(struct job_details *detail_ptr, - struct config_record *config_ptr); + struct config_record *config_ptr, + bool update_count); /* @@ -438,9 +441,10 @@ _get_req_features(struct node_set *node_set_ptr, int node_set_size, #if 0 { char *tmp_str = bitmap2node_name(feature_bitmap); - info("job %u needs %u nodes with feature %s, using %s", - job_ptr->job_id, feat_ptr->count, - feat_ptr->name, tmp_str); + info("job %u needs %u nodes with feature %s, " + "using %s, error_code=%d", + job_ptr->job_id, feat_ptr->count, + feat_ptr->name, tmp_str, error_code); xfree(tmp_str); } #endif @@ -1145,11 +1149,48 @@ extern int select_nodes(struct job_record *job_ptr, bool test_only, return error_code; } +/* Clear tmp_cnt for all features of given job */ +static void _reset_feature_counts(struct job_details *details_ptr) +{ + ListIterator feat_iter; + struct feature_record *feat_ptr; + + if (details_ptr->feature_list == NULL) /* no constraints */ + return; + + feat_iter = list_iterator_create(details_ptr->feature_list); + while ((feat_ptr = (struct feature_record *) list_next(feat_iter))) { + feat_ptr->tmp_cnt 
= 0; + } + list_iterator_destroy(feat_iter); +} + +/* Verify that tmp_cnt >= count for all features of given job */ +static bool _valid_feature_counts(struct job_details *details_ptr) +{ + ListIterator feat_iter; + struct feature_record *feat_ptr; + bool result = true; + + if (details_ptr->feature_list == NULL) /* no constraints */ + return result; + + feat_iter = list_iterator_create(details_ptr->feature_list); + while ((feat_ptr = (struct feature_record *) list_next(feat_iter))) { + if (feat_ptr->tmp_cnt >= feat_ptr->count) + continue; + result = false; + break; + } + list_iterator_destroy(feat_iter); + return result; +} + /* * job_req_node_filter - job request node filter. * clear from a bitmap the nodes which can not be used for a job * test memory size, required features, processor count, etc. - * NOTE: Does not support exclusive OR of features or feature counts. + * NOTE: Does not support exclusive OR of features. * It just matches first element of XOR and ignores count. * IN job_ptr - pointer to job to be scheduled * IN/OUT bitmap - set of nodes being considered for use @@ -1171,13 +1212,14 @@ extern int job_req_node_filter(struct job_record *job_ptr, return EINVAL; } + _reset_feature_counts(detail_ptr); mc_ptr = detail_ptr->mc_ptr; for (i=0; i< node_record_count; i++) { if (!bit_test(avail_bitmap, i)) continue; node_ptr = node_record_table_ptr + i; config_ptr = node_ptr->config_ptr; - feature_bitmap = _valid_features(detail_ptr, config_ptr); + feature_bitmap = _valid_features(detail_ptr, config_ptr, true); if ((feature_bitmap == NULL) || (!bit_test(feature_bitmap, 0))) { bit_clear(avail_bitmap, i); @@ -1228,6 +1270,10 @@ extern int job_req_node_filter(struct job_record *job_ptr, } } FREE_NULL_BITMAP(feature_bitmap); + + if (!_valid_feature_counts(detail_ptr)) + return EINVAL; + return SLURM_SUCCESS; } @@ -1354,7 +1400,8 @@ static int _build_node_list(struct job_record *job_ptr, continue; } - tmp_feature = _valid_features(job_ptr->details, config_ptr); + tmp_feature = _valid_features(job_ptr->details, config_ptr, + false); if (tmp_feature == NULL) { FREE_NULL_BITMAP(node_set_ptr[node_set_inx].my_bitmap); continue; @@ -1594,6 +1641,8 @@ extern void build_node_details(struct job_record *job_ptr) * the available nodes * IN details_ptr - job requirement details, includes requested features * IN config_ptr - node's configuration record + * IN update_count - if set, then increment tmp_cnt (temporary counter) + * for matched features * RET NULL if request is not satisfied, otherwise a bitmap indicating * which mutually exclusive features are satisfied. For example * _valid_features("[fs1|fs2|fs3|fs4]", "fs3") returns a bitmap with * the third bit set. If no mutually exclusive feature list, the first bit * is set. * NOTE: The bitmap returned by this function must be freed by the caller. * NOTE: Currently supports only simple AND of features or a single * mutually exclusive feature list. 
*/ static bitstr_t *_valid_features(struct job_details *details_ptr, - struct config_record *config_ptr) + struct config_record *config_ptr, + bool update_count) { bitstr_t *result_bits = (bitstr_t *) NULL; ListIterator feat_iter; struct feature_record *feat_ptr; - int found, last_op, position = 0, result; - int save_op = FEATURE_OP_AND, save_result=1; + bool found, test_names, result; + int last_op, position = 0; + int save_op = FEATURE_OP_AND, save_result = 1; - if (details_ptr->feature_list == NULL) {/* no constraints */ + if (details_ptr->feature_list == NULL) { /* no constraints */ result_bits = bit_alloc(MAX_FEATURES); bit_set(result_bits, 0); return result_bits; } - result = 1; /* assume good for now */ + result = true; /* assume good for now */ last_op = FEATURE_OP_AND; feat_iter = list_iterator_create(details_ptr->feature_list); while ((feat_ptr = (struct feature_record *) list_next(feat_iter))) { - found = 0; - if (feat_ptr->count) - found = 1; - else if (config_ptr->feature_array) { + test_names = false; + found = false; + if (feat_ptr->count) { + found = true; + if (update_count) + test_names = true; + } else + test_names = true; + + if (test_names && config_ptr->feature_array) { int i; for (i=0; config_ptr->feature_array[i]; i++) { if (strcmp(feat_ptr->name, config_ptr->feature_array[i])) continue; - found = 1; + found = true; + if (update_count && feat_ptr->count) + feat_ptr->tmp_cnt++; break; } } diff --git a/src/slurmctld/slurmctld.h b/src/slurmctld/slurmctld.h index afcc54e11796f81c9b3b713ebec35dca7e625a6f..c828d7cb3bcc26ea0455dae6ddad3c2d329b0cc5 100644 --- a/src/slurmctld/slurmctld.h +++ b/src/slurmctld/slurmctld.h @@ -381,6 +381,7 @@ struct feature_record { char *name; /* name of feature */ uint16_t count; /* count of nodes with this feature */ uint8_t op_code; /* separator, see FEATURE_OP_ above */ + uint16_t tmp_cnt; /* temporary count of allocated nodes */ }; /* job_details - specification of a job's constraints, diff --git a/testsuite/expect/globals b/testsuite/expect/globals index e3fb8db94a2f40bbe144b1053555a456d05d2544..e9520b849dace7da6451c6ae4946de6845a3ec32 100755 --- a/testsuite/expect/globals +++ b/testsuite/expect/globals @@ -624,8 +624,8 @@ proc test_assoc_enforced { } { set assoc_enforced 0 spawn $scontrol show config expect { -re "AccountingStorageEnforce *= ($number)" { - set assoc_enforced $expect_out(1,string) + -re "AccountingStorageEnforce *= associations" { + set assoc_enforced 1 exp_continue } eof {
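
The heart of the patch's backfill support for feature counts is the save/clear/restore pattern in _try_sched(): every per-feature node count is saved and cleared, a single placement test is sized by the largest saved count, and the counts are restored unconditionally before returning. Below is a minimal standalone C sketch of that pattern; struct feature_spec and try_placement() are hypothetical stand-ins for SLURM's internal feature_record list and select_g_job_test(), not SLURM APIs.

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

#define MAX(a, b) (((a) > (b)) ? (a) : (b))

struct feature_spec {
	const char *name;	/* feature name, e.g. "fs1" */
	uint16_t count;		/* nodes required with this feature */
};

/* Hypothetical stand-in for select_g_job_test(): succeed when the
 * available node pool can cover the requested node count. */
static int try_placement(uint32_t want_nodes, uint32_t avail_nodes)
{
	return (avail_nodes >= want_nodes) ? 0 : -1;
}

/* Save and clear every per-feature count, run one placement test
 * sized by the largest count, then restore the counts. */
static int sched_with_feature_counts(struct feature_spec *feats,
				     int nfeats, uint32_t avail_nodes)
{
	uint16_t *saved;
	uint16_t high_cnt = 0;
	int i, rc;

	saved = malloc(sizeof(uint16_t) * nfeats);
	if (saved == NULL)
		return -1;
	for (i = 0; i < nfeats; i++) {
		high_cnt = MAX(high_cnt, feats[i].count);
		saved[i] = feats[i].count;
		feats[i].count = 0;	/* cleared, as in _try_sched */
	}

	rc = try_placement(high_cnt, avail_nodes);

	for (i = 0; i < nfeats; i++)	/* restore before returning */
		feats[i].count = saved[i];
	free(saved);
	return rc;
}

int main(void)
{
	struct feature_spec feats[] = { {"fs1", 4}, {"fs2", 2} };

	/* Eight available nodes cover the largest count (4). */
	printf("rc=%d\n", sched_with_feature_counts(feats, 2, 8));
	/* The counts survive the call intact. */
	printf("fs1=%u fs2=%u\n", (unsigned) feats[0].count,
	       (unsigned) feats[1].count);
	return 0;
}

As the comment in _try_sched() concedes, a single placement sized by the largest count is exact when only one feature carries a count and only approximate when several do.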
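The reworked _start_job() also changes how off-limits nodes are communicated: instead of rebuilding the job's excluded-node bitmap from the complement of the available-node map, it ORs the backfill reservation map into any user-supplied exclusion list. A hedged sketch of that bit arithmetic follows, with a plain uint64_t mask standing in for SLURM's bitstr_t; exc_nodes and resv_nodes are illustrative names, not SLURM symbols.

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	/* Bit i set == node i may NOT be used by this job. */
	uint64_t exc_nodes  = 0x3;	/* user excluded nodes 0-1 */
	uint64_t resv_nodes = 0xf0;	/* backfill reserved nodes 4-7 */

	/* New _start_job(): fold the reservation into the exclusion
	 * list; the old code instead rebuilt the exclusion list from
	 * the complement of the whole available-node map. */
	exc_nodes |= resv_nodes;

	printf("excluded mask: 0x%llx\n", (unsigned long long) exc_nodes);
	return 0;
}

Ordering matters in the real code: the original exclusion bitmap is saved first so it can be restored after select_nodes() returns.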