diff --git a/NEWS b/NEWS index cd622a8d928907d211c77fd6d77ab46eb1940650..b68ea5370131bbfa3a32ff3ac70063aef9d24f1d 100644 --- a/NEWS +++ b/NEWS @@ -276,6 +276,13 @@ documents those changes that are of interest to users and administrators. -- sview - Verify pointer before using strchr. -- -M option on tools talking to a Cray from a non-Cray fixed. -- CRAY - Fix rpmbuild issue for missing file slurm.conf.template. + -- Fix race condition when dealing with removing many associations at + different times when reservations are using the associations that are + being deleted. + -- When a node's state is set to power_up, then execute ResumeProgram even if + previously executed for that node. + -- Fix logic determining when job configuration (i.e. running node power up + logic) is complete. * Changes in Slurm 14.03.8 ========================== diff --git a/src/common/assoc_mgr.c b/src/common/assoc_mgr.c index c8b184e2d769076d7a3454c4534c9c345bb963a1..77e0a872d7497cc4d62b08759bbc971134c63796 100644 --- a/src/common/assoc_mgr.c +++ b/src/common/assoc_mgr.c @@ -1769,7 +1769,11 @@ extern void destroy_assoc_mgr_qos_usage(void *object) } } - +/* Since the returned assoc_list is full of pointers from the + * assoc_mgr_association_list assoc_mgr_lock_t READ_LOCK on + * associations must be set before calling this function and while + * handling it after a return. + */ extern int assoc_mgr_get_user_assocs(void *db_conn, slurmdb_association_rec_t *assoc, int enforce, @@ -1778,8 +1782,6 @@ extern int assoc_mgr_get_user_assocs(void *db_conn, ListIterator itr = NULL; slurmdb_association_rec_t *found_assoc = NULL; int set = 0; - assoc_mgr_lock_t locks = { READ_LOCK, NO_LOCK, - NO_LOCK, NO_LOCK, NO_LOCK, NO_LOCK }; xassert(assoc); xassert(assoc->uid != NO_VAL); @@ -1793,12 +1795,9 @@ extern int assoc_mgr_get_user_assocs(void *db_conn, if (assoc_mgr_refresh_lists(db_conn) == SLURM_ERROR) return SLURM_ERROR; - assoc_mgr_lock(&locks); - if ((!assoc_mgr_association_list || !list_count(assoc_mgr_association_list)) && !(enforce & ACCOUNTING_ENFORCE_ASSOCS)) { - assoc_mgr_unlock(&locks); return SLURM_SUCCESS; } @@ -1814,7 +1813,6 @@ extern int assoc_mgr_get_user_assocs(void *db_conn, set = 1; } list_iterator_destroy(itr); - assoc_mgr_unlock(&locks); if (!set) { debug("UID %u has no associations", assoc->uid); @@ -1827,7 +1825,8 @@ extern int assoc_mgr_get_user_assocs(void *db_conn, extern int assoc_mgr_fill_in_assoc(void *db_conn, slurmdb_association_rec_t *assoc, int enforce, - slurmdb_association_rec_t **assoc_pptr) + slurmdb_association_rec_t **assoc_pptr, + bool locked) { slurmdb_association_rec_t * ret_assoc = NULL; assoc_mgr_lock_t locks = { READ_LOCK, NO_LOCK, @@ -1901,7 +1900,8 @@ extern int assoc_mgr_fill_in_assoc(void *db_conn, /* "cluster=%s, partition=%s", */ /* assoc->user, assoc->uid, assoc->acct, */ /* assoc->cluster, assoc->partition); */ - assoc_mgr_lock(&locks); + if (!locked) + assoc_mgr_lock(&locks); /* First look for the assoc with a partition and then check * for the non-partition association if we don't find one. @@ -1915,7 +1915,8 @@ extern int assoc_mgr_fill_in_assoc(void *db_conn, } if (!ret_assoc) { - assoc_mgr_unlock(&locks); + if (!locked) + assoc_mgr_unlock(&locks); if (enforce & ACCOUNTING_ENFORCE_ASSOCS) return SLURM_ERROR; else @@ -2004,8 +2005,8 @@ extern int assoc_mgr_fill_in_assoc(void *db_conn, if (!assoc->user) assoc->user = ret_assoc->user; - - assoc_mgr_unlock(&locks); + if (!locked) + assoc_mgr_unlock(&locks); return SLURM_SUCCESS; } diff --git a/src/common/assoc_mgr.h b/src/common/assoc_mgr.h index b97011a133fb29f295ea945f1ddee948c2fcf0f1..37961f058ffb873007815b645cba7cd761a9e14c 100644 --- a/src/common/assoc_mgr.h +++ b/src/common/assoc_mgr.h @@ -222,6 +222,11 @@ extern void destroy_assoc_mgr_qos_usage(void *object); * list should be created with list_create(NULL) * since we are putting pointers to memory used elsewhere. * RET: SLURM_SUCCESS on success, else SLURM_ERROR + * + * NOTE: Since the returned assoc_list is full of pointers from the + * assoc_mgr_association_list assoc_mgr_lock_t READ_LOCK on + * associations must be set before calling this function and while + * handling it after a return. */ extern int assoc_mgr_get_user_assocs(void *db_conn, slurmdb_association_rec_t *assoc, @@ -238,12 +243,18 @@ extern int assoc_mgr_get_user_assocs(void *db_conn, * IN/OUT: assoc_pptr - if non-NULL then return a pointer to the * slurmdb_association record in cache on success * DO NOT FREE. + * IN: locked - If you plan on using assoc_pptr after this function + * you need to have an assoc_mgr_lock_t READ_LOCK for + * associations while you use it before and after the + * return. This is not required if using the assoc for + * non-pointer portions. * RET: SLURM_SUCCESS on success, else SLURM_ERROR */ extern int assoc_mgr_fill_in_assoc(void *db_conn, slurmdb_association_rec_t *assoc, int enforce, - slurmdb_association_rec_t **assoc_pptr); + slurmdb_association_rec_t **assoc_pptr, + bool locked); /* * get info from the storage diff --git a/src/slurmctld/acct_policy.c b/src/slurmctld/acct_policy.c index eb6d7496cd566db2e796bfb76e955d26cf615373..cbdd71be9a66015c296c7e9e7cc8fbfb20d2a530 100644 --- a/src/slurmctld/acct_policy.c +++ b/src/slurmctld/acct_policy.c @@ -97,7 +97,7 @@ static bool _valid_job_assoc(struct job_record *job_ptr) if (assoc_mgr_fill_in_assoc(acct_db_conn, &assoc_rec, accounting_enforce, (slurmdb_association_rec_t **) - &job_ptr->assoc_ptr)) { + &job_ptr->assoc_ptr, false)) { info("_validate_job_assoc: invalid account or " "partition for uid=%u jobid=%u", job_ptr->user_id, job_ptr->job_id); diff --git a/src/slurmctld/controller.c b/src/slurmctld/controller.c index b48882ddcf340ccc0a7be4d866da0d6d84ee82ec..9034a0653c9bfdcd0d46ac4f071979a393bdb766 100644 --- a/src/slurmctld/controller.c +++ b/src/slurmctld/controller.c @@ -2304,7 +2304,7 @@ static void *_assoc_cache_mgr(void *no_data) acct_db_conn, &assoc_rec, accounting_enforce, (slurmdb_association_rec_t **) - &job_ptr->assoc_ptr)) { + &job_ptr->assoc_ptr, false)) { verbose("Invalid association id %u " "for job id %u", job_ptr->assoc_id, job_ptr->job_id); diff --git a/src/slurmctld/job_mgr.c b/src/slurmctld/job_mgr.c index 74b7a638119e408b840c5f2161972589a0d78dca..42f5075d56f99cee378d8e5ba891f72fbfa5a7cf 100644 --- a/src/slurmctld/job_mgr.c +++ b/src/slurmctld/job_mgr.c @@ -1965,7 +1965,7 @@ static int _load_job_state(Buf buffer, uint16_t protocol_version) if (assoc_mgr_fill_in_assoc(acct_db_conn, &assoc_rec, accounting_enforce, (slurmdb_association_rec_t **) - &job_ptr->assoc_ptr) && + &job_ptr->assoc_ptr, false) && (accounting_enforce & ACCOUNTING_ENFORCE_ASSOCS) && (!IS_JOB_FINISHED(job_ptr))) { info("Holding job %u with invalid association", job_id); @@ -4857,7 +4857,7 @@ static int _valid_job_part(job_desc_msg_t * job_desc, assoc_mgr_fill_in_assoc( acct_db_conn, &assoc_rec, - accounting_enforce, NULL); + accounting_enforce, NULL, false); } if (assoc_ptr && assoc_rec.id != assoc_ptr->id) { @@ -5239,9 +5239,11 @@ static int _job_create(job_desc_msg_t * job_desc, int allocate, int will_run, assoc_rec.acct = job_desc->account; assoc_rec.partition = part_ptr->name; assoc_rec.uid = job_desc->user_id; - + /* Checks are done later to validate assoc_ptr, so we don't + need to lock outside of fill_in_assoc. + */ if (assoc_mgr_fill_in_assoc(acct_db_conn, &assoc_rec, - accounting_enforce, &assoc_ptr)) { + accounting_enforce, &assoc_ptr, false)) { info("_job_create: invalid account or partition for user %u, " "account '%s', and partition '%s'", job_desc->user_id, assoc_rec.acct, assoc_rec.partition); @@ -5255,7 +5257,7 @@ static int _job_create(job_desc_msg_t * job_desc, int allocate, int will_run, * accounting records. */ assoc_rec.acct = NULL; assoc_mgr_fill_in_assoc(acct_db_conn, &assoc_rec, - accounting_enforce, &assoc_ptr); + accounting_enforce, &assoc_ptr, false); if (assoc_ptr) { info("_job_create: account '%s' has no association " "for user %u using default account '%s'", @@ -5264,6 +5266,7 @@ static int _job_create(job_desc_msg_t * job_desc, int allocate, int will_run, xfree(job_desc->account); } } + if (job_desc->account == NULL) job_desc->account = xstrdup(assoc_rec.acct); @@ -6647,10 +6650,8 @@ void job_time_limit(void) if (IS_JOB_CONFIGURING(job_ptr)) { if (!IS_JOB_RUNNING(job_ptr) || - ((bit_overlap(job_ptr->node_bitmap, - power_node_bitmap) == 0) && - (bit_overlap(job_ptr->node_bitmap, - avail_node_bitmap) == 0))) { + (bit_overlap(job_ptr->node_bitmap, + power_node_bitmap) == 0)) { debug("%s: Configuration for job %u is complete", __func__, job_ptr->job_id); job_ptr->job_state &= (~JOB_CONFIGURING); @@ -8809,7 +8810,7 @@ static int _update_job(struct job_record *job_ptr, job_desc_msg_t * job_specs, acct_db_conn, &assoc_rec, accounting_enforce, (slurmdb_association_rec_t **) - &job_ptr->assoc_ptr)) { + &job_ptr->assoc_ptr, false)) { info("job_update: invalid account %s " "for job %u", job_specs->account, job_ptr->job_id); @@ -11500,7 +11501,7 @@ extern void job_completion_logger(struct job_record *job_ptr, bool requeue) if (!(assoc_mgr_fill_in_assoc(acct_db_conn, &assoc_rec, accounting_enforce, (slurmdb_association_rec_t **) - &job_ptr->assoc_ptr))) { + &job_ptr->assoc_ptr, false))) { job_ptr->assoc_id = assoc_rec.id; /* we have to call job start again because the * associd does not get updated in job complete */ @@ -12768,7 +12769,7 @@ extern int update_job_account(char *module, struct job_record *job_ptr, if (assoc_mgr_fill_in_assoc(acct_db_conn, &assoc_rec, accounting_enforce, (slurmdb_association_rec_t **) - &job_ptr->assoc_ptr)) { + &job_ptr->assoc_ptr, false)) { info("%s: invalid account %s for job_id %u", module, new_account, job_ptr->job_id); return ESLURM_INVALID_ACCOUNT; @@ -12783,7 +12784,7 @@ extern int update_job_account(char *module, struct job_record *job_ptr, assoc_mgr_fill_in_assoc(acct_db_conn, &assoc_rec, accounting_enforce, (slurmdb_association_rec_t **) - &job_ptr->assoc_ptr); + &job_ptr->assoc_ptr, false); if (!job_ptr->assoc_ptr) { debug("%s: we didn't have an association for account " "'%s' and user '%u', and we can't seem to find " @@ -12902,7 +12903,7 @@ extern int send_jobs_to_accounting(void) acct_db_conn, &assoc_rec, accounting_enforce, (slurmdb_association_rec_t **) - &job_ptr->assoc_ptr) && + &job_ptr->assoc_ptr, false) && (accounting_enforce & ACCOUNTING_ENFORCE_ASSOCS) && (!IS_JOB_FINISHED(job_ptr))) { info("Holding job %u with " diff --git a/src/slurmctld/job_scheduler.c b/src/slurmctld/job_scheduler.c index f45b2f47db091fd2647c11489e2b646934475fd0..ead619200d68b18fa9f7ac7eac38ab2a8262cf94 100644 --- a/src/slurmctld/job_scheduler.c +++ b/src/slurmctld/job_scheduler.c @@ -586,7 +586,8 @@ next_part: part_ptr = (struct part_record *) if (!assoc_mgr_fill_in_assoc(acct_db_conn, &assoc_rec, accounting_enforce, (slurmdb_association_rec_t **) - &job_ptr->assoc_ptr)) { + &job_ptr->assoc_ptr, + false)) { job_ptr->state_reason = WAIT_NO_REASON; xfree(job_ptr->state_desc); job_ptr->assoc_id = assoc_rec.id; @@ -1176,7 +1177,8 @@ next_task: if (!assoc_mgr_fill_in_assoc(acct_db_conn, &assoc_rec, accounting_enforce, (slurmdb_association_rec_t **) - &job_ptr->assoc_ptr)) { + &job_ptr->assoc_ptr, + false)) { job_ptr->state_reason = WAIT_NO_REASON; xfree(job_ptr->state_desc); job_ptr->assoc_id = assoc_rec.id; diff --git a/src/slurmctld/node_mgr.c b/src/slurmctld/node_mgr.c index ccdb7fe0fb39b9f1b13deb4366523b043db07f27..b8ca775e4f0437acd7fb34c8b6861b5cd2664290 100644 --- a/src/slurmctld/node_mgr.c +++ b/src/slurmctld/node_mgr.c @@ -1376,8 +1376,18 @@ int update_node ( update_node_msg_t * update_node_msg ) continue; } else if (state_val == NODE_STATE_POWER_UP) { if (!IS_NODE_POWER_SAVE(node_ptr)) { - verbose("node %s already powered up", - this_node_name); + if (IS_NODE_POWER_UP(node_ptr)) { + node_ptr->last_idle = now; + node_ptr->node_state |= + NODE_STATE_POWER_SAVE; + info("power up request " + "repeating for node %s", + this_node_name); + } else { + verbose("node %s is already " + "powered up", + this_node_name); + } } else { node_ptr->last_idle = now; info("powering up node %s", diff --git a/src/slurmctld/reservation.c b/src/slurmctld/reservation.c index 9e85c167083192d40d5092b3a02d332f873a8aff..e9d615c0057bcf404f5377ca5b14360485b013a1 100644 --- a/src/slurmctld/reservation.c +++ b/src/slurmctld/reservation.c @@ -506,12 +506,17 @@ static bool _is_account_valid(char *account) assoc_rec.acct = account; if (assoc_mgr_fill_in_assoc(acct_db_conn, &assoc_rec, - accounting_enforce, &assoc_ptr)) { + accounting_enforce, &assoc_ptr, false)) { return false; } return true; } +/* Since the returned assoc_list is full of pointers from the + * assoc_mgr_association_list assoc_mgr_lock_t READ_LOCK on + * associations must be set before calling this function and while + * handling it after a return. + */ static int _append_assoc_list(List assoc_list, slurmdb_association_rec_t *assoc) { int rc = ESLURM_INVALID_ACCOUNT; @@ -519,7 +524,7 @@ static int _append_assoc_list(List assoc_list, slurmdb_association_rec_t *assoc) if (assoc_mgr_fill_in_assoc( acct_db_conn, assoc, accounting_enforce, - &assoc_ptr)) { + &assoc_ptr, true)) { if (accounting_enforce & ACCOUNTING_ENFORCE_ASSOCS) { error("No association for user %u and account %s", assoc->uid, assoc->acct); @@ -544,6 +549,9 @@ static int _set_assoc_list(slurmctld_resv_t *resv_ptr) int rc = SLURM_SUCCESS, i = 0, j = 0; List assoc_list_allow = NULL, assoc_list_deny = NULL, assoc_list; slurmdb_association_rec_t assoc, *assoc_ptr = NULL; + assoc_mgr_lock_t locks = { READ_LOCK, NO_LOCK, + NO_LOCK, NO_LOCK, NO_LOCK, NO_LOCK }; + /* no need to do this if we can't ;) */ if (!association_based_accounting) @@ -555,6 +563,7 @@ static int _set_assoc_list(slurmctld_resv_t *resv_ptr) memset(&assoc, 0, sizeof(slurmdb_association_rec_t)); xfree(resv_ptr->assoc_list); + assoc_mgr_lock(&locks); if (resv_ptr->account_cnt && resv_ptr->user_cnt) { if (!resv_ptr->account_not && !resv_ptr->user_not) { /* Add every association that matches both account @@ -565,8 +574,8 @@ static int _set_assoc_list(slurmctld_resv_t *resv_ptr) sizeof(slurmdb_association_rec_t)); assoc.acct = resv_ptr->account_list[j]; assoc.uid = resv_ptr->user_list[i]; - rc = _append_assoc_list(assoc_list_allow, - &assoc); + rc = _append_assoc_list( + assoc_list_allow, &assoc); if (rc != SLURM_SUCCESS) goto end_it; } @@ -675,6 +684,8 @@ static int _set_assoc_list(slurmctld_resv_t *resv_ptr) end_it: list_destroy(assoc_list_allow); list_destroy(assoc_list_deny); + assoc_mgr_unlock(&locks); + return rc; } @@ -3669,7 +3680,7 @@ static int _valid_job_access_resv(struct job_record *job_ptr, acct_db_conn, &assoc_rec, accounting_enforce, (slurmdb_association_rec_t **) - &job_ptr->assoc_ptr)) + &job_ptr->assoc_ptr, false)) goto end_it; } diff --git a/testsuite/expect/test1.59 b/testsuite/expect/test1.59 index ec80998e91f57ab2958a166de595af40be3bd5af..38559b1848d857413abc9fdc7fb17d757ef7f35c 100755 --- a/testsuite/expect/test1.59 +++ b/testsuite/expect/test1.59 @@ -374,30 +374,30 @@ for {set i 0} {$i<5} {incr i} { } if { $i == 1 } { if { [string compare $node0 $1node2] } { - send_user "\nFAILURE: tasks not distributed by hostfile\n" + send_user "\nFAILURE: tasks not distributed by -w\n" set exit_code 1 } elseif { [string compare $node1 $1node0] } { - send_user "\nFAILURE: tasks not distributed by hostfile\n" + send_user "\nFAILURE: tasks not distributed by -w\n" set exit_code 1 } elseif { [string compare $node2 $1node1] } { - send_user "\nFAILURE: tasks not distributed by hostfile\n" + send_user "\nFAILURE: tasks not distributed by -w\n" set exit_code 1 } } elseif { $i == 2 } { if { [string compare $node0 $2node0] } { - send_user "\nFAILURE: tasks not distributed by hostfile\n" + send_user "\nFAILURE: tasks not distributed by -w\n" set exit_code 1 } elseif { [string compare $node1 $2node1] } { - send_user "\nFAILURE: tasks not distributed by hostfile\n" + send_user "\nFAILURE: tasks not distributed by -w\n" set exit_code 1 } elseif { [string compare $node2 $2node2] } { - send_user "\nFAILURE: tasks not distributed by hostfile\n" + send_user "\nFAILURE: tasks not distributed by -w\n" set exit_code 1 } elseif { [string compare $node3 $2node3] } { - send_user "\nFAILURE: tasks not distributed by hostfile\n" + send_user "\nFAILURE: tasks not distributed by -w\n" set exit_code 1 } elseif { [string compare $node4 $2node4] } { - send_user "\nFAILURE: tasks not distributed by hostfile\n" + send_user "\nFAILURE: tasks not distributed by -w\n" set exit_code 1 } } elseif { $i == 3 } {