diff --git a/NEWS b/NEWS index 0f941190e12c0d9d50ac49b3c90cee4659e071ff..cc3d07157a9985e01be09dfa36530b7a2532d519 100644 --- a/NEWS +++ b/NEWS @@ -81,6 +81,9 @@ documents those changes that are of interest to users and administrators. -- Functions slurm_load_node() slurm_load_partitions() modified to show all nodes/partitions in a federation when the SHOW_GLOBAL flag is used. +* Changes in Slurm 17.02.4 +========================== + * Changes in Slurm 17.02.3 ========================== -- Increase --cpu_bind and --mem_bind field length limits. @@ -137,6 +140,14 @@ documents those changes that are of interest to users and administrators. -- MYSQL - Fix memory leak when loading archived jobs into the database. -- Fix potential race condition when starting the priority/multifactor plugin's decay thread. + -- Sanity check to make sure we have started a job in acct_policy.c before we + clear it as started. + -- Allow reboot program to use arguments. + -- Message Aggr - Remove race condition on slurmd shutdown with respects to + destroying a mutex. + -- Fix updating job priority on multiple partitions to be correct. + -- Don't remove admin comment when updating a job. + -- Return error when bad separator is given for scontrol update job licenses. * Changes in Slurm 17.02.2 ========================== diff --git a/src/common/msg_aggr.c b/src/common/msg_aggr.c index 9d371b632bd2659e5681104e1d3d816853eda9ca..5ab41f17c1d5f441a0aaca9117777c9506f63323 100644 --- a/src/common/msg_aggr.c +++ b/src/common/msg_aggr.c @@ -333,7 +333,6 @@ extern void msg_aggr_sender_fini(void) FREE_NULL_LIST(msg_collection.msg_aggr_list); slurm_mutex_unlock(&msg_collection.aggr_mutex); FREE_NULL_LIST(msg_collection.msg_list); - slurm_mutex_destroy(&msg_collection.aggr_mutex); slurm_mutex_destroy(&msg_collection.mutex); } @@ -342,6 +341,7 @@ extern void msg_aggr_add_msg(slurm_msg_t *msg, bool wait, { int count; static uint16_t msg_index = 1; + static uint32_t wait_count = 0; if (!msg_collection.running) return; @@ -388,12 +388,16 @@ extern void msg_aggr_add_msg(slurm_msg_t *msg, bool wait, timeout.tv_sec = now.tv_sec + msg_timeout; timeout.tv_nsec = now.tv_usec * 1000; + wait_count++; if (pthread_cond_timedwait(&msg_aggr->wait_cond, &msg_collection.aggr_mutex, &timeout) == ETIMEDOUT) _handle_msg_aggr_ret(msg_aggr->msg_index, 1); + wait_count--; slurm_mutex_unlock(&msg_collection.aggr_mutex); - +; + if (!msg_collection.running && !wait_count) + slurm_mutex_destroy(&msg_collection.aggr_mutex); _msg_aggr_free(msg_aggr); } diff --git a/src/common/slurm_selecttype_info.c b/src/common/slurm_selecttype_info.c index b38b837e875be6ee580e5c54d6053f2ca21f326b..9c50e812247cfe0cddfa06b2c4b0b8068a10ed39 100644 --- a/src/common/slurm_selecttype_info.c +++ b/src/common/slurm_selecttype_info.c @@ -52,7 +52,6 @@ int parse_select_type_param(char *select_type_parameters, uint16_t *param) int rc = SLURM_SUCCESS; char *str_parameters, *st_str = NULL; int param_cnt = 0; - bool one_task = false; *param = 0; st_str = xstrdup(select_type_parameters); @@ -82,9 +81,6 @@ int parse_select_type_param(char *select_type_parameters, uint16_t *param) *param |= CR_CPU; *param |= CR_MEMORY; param_cnt++; - } else if (!xstrcasecmp(str_parameters, - "CR_ONE_TASK_PER_CORE")) { - one_task = true; } else if (!xstrcasecmp(str_parameters, "other_cons_res")) { *param |= CR_OTHER_CONS_RES; } else if (!xstrcasecmp(str_parameters, @@ -119,7 +115,7 @@ int parse_select_type_param(char *select_type_parameters, uint16_t *param) return rc; } - if ((*param & CR_CPU) && one_task) { + if ((*param & CR_CPU) && (*param & CR_ONE_TASK_PER_CORE)) { error("CR_ONE_TASK_PER_CORE is not compatible with CR_CPU*, please change to use CR_CORE* instead."); rc = SLURM_ERROR; xfree(st_str); diff --git a/src/slurmctld/acct_policy.c b/src/slurmctld/acct_policy.c index a89384d390f607331ca85482ae121ac3dbcb6174..e3f5c57897462fde669000c2fc1e01bb23082b2b 100644 --- a/src/slurmctld/acct_policy.c +++ b/src/slurmctld/acct_policy.c @@ -546,6 +546,13 @@ static void _qos_adjust_limit_usage(int type, struct job_record *job_ptr, used_limits_a->jobs++; break; case ACCT_POLICY_JOB_FINI: + /* + * If tres_alloc_cnt doesn't exist means ACCT_POLICY_JOB_BEGIN + * was never called so no need to clean up that which was never + * set up. + */ + if (!job_ptr->tres_alloc_cnt) + break; qos_ptr->usage->grp_used_jobs--; if ((int32_t)qos_ptr->usage->grp_used_jobs < 0) { qos_ptr->usage->grp_used_jobs = 0; diff --git a/src/slurmctld/job_mgr.c b/src/slurmctld/job_mgr.c index 8fe016a86f21b86d864febc2a6726fb525e1aed1..a4a1faa234ef5913de379959d1c40cfdcbe8c627 100644 --- a/src/slurmctld/job_mgr.c +++ b/src/slurmctld/job_mgr.c @@ -10960,23 +10960,29 @@ static int _update_job(struct job_record *job_ptr, job_desc_msg_t * job_specs, } /* Always do this last just in case the assoc_ptr changed */ - if (job_specs->admin_comment && !validate_super_user(uid)) { - error("Attempt to change admin_comment for job %u", - job_ptr->job_id); - error_code = ESLURM_ACCESS_DENIED; - } else if (job_specs->admin_comment && - (job_specs->admin_comment[0] == '+') && - (job_specs->admin_comment[1] == '=')) { - if (job_ptr->admin_comment) - xstrcat(job_ptr->admin_comment, ","); - xstrcat(job_ptr->admin_comment, job_specs->admin_comment + 2); - info("update_job: setting admin_comment to %s for job_id %u", - job_ptr->admin_comment, job_ptr->job_id); - } else { - xfree(job_ptr->admin_comment); - job_ptr->admin_comment = xstrdup(job_specs->admin_comment); - info("update_job: setting admin_comment to %s for job_id %u", - job_ptr->admin_comment, job_ptr->job_id); + if (job_specs->admin_comment) { + if (!validate_super_user(uid)) { + error("Attempt to change admin_comment for job %u", + job_ptr->job_id); + error_code = ESLURM_ACCESS_DENIED; + } else if ((job_specs->admin_comment[0] == '+') && + (job_specs->admin_comment[1] == '=')) { + if (job_ptr->admin_comment) + xstrcat(job_ptr->admin_comment, ","); + xstrcat(job_ptr->admin_comment, + job_specs->admin_comment + 2); + info("update_job: adding to admin_comment it is now %s for job_id %u", + job_ptr->admin_comment, job_ptr->job_id); + } else if (!xstrcmp(job_ptr->admin_comment, + job_specs->admin_comment)) { + info("update_job: admin_comment the same as before, not changing"); + } else { + xfree(job_ptr->admin_comment); + job_ptr->admin_comment = + xstrdup(job_specs->admin_comment); + info("update_job: setting admin_comment to %s for job_id %u", + job_ptr->admin_comment, job_ptr->job_id); + } } /* Always do this last just in case the assoc_ptr changed */ @@ -11472,6 +11478,15 @@ static int _update_job(struct job_record *job_ptr, job_desc_msg_t * job_specs, } else error_code = ESLURM_PRIO_RESET_FAIL; job_ptr->priority = job_specs->priority; + if (job_ptr->part_ptr_list && + job_ptr->priority_array) { + int i, j = list_count( + job_ptr->part_ptr_list); + for (i = 0; i < j; i++) { + job_ptr->priority_array[i] = + job_specs->priority; + } + } } info("sched: update_job: setting priority to %u for " "job_id %u", job_ptr->priority, diff --git a/src/slurmctld/licenses.c b/src/slurmctld/licenses.c index 0e65617acf74ad03334f6a56ddcd4732759645d6..2a7053e3f580190e09bf59fe337f37e2a55b2b6b 100644 --- a/src/slurmctld/licenses.c +++ b/src/slurmctld/licenses.c @@ -148,9 +148,12 @@ static List _build_license_list(char *licenses, bool *valid) if ((token[i] == ':') || (token[i] == '*')) { token[i++] = '\0'; num = (int32_t)strtol(&token[i], &end_num, 10); + if (*end_num != '\0') + *valid = false; + break; } } - if (num < 0) { + if (num < 0 || !(*valid)) { *valid = false; break; } diff --git a/src/slurmctld/node_scheduler.c b/src/slurmctld/node_scheduler.c index 9f27685201073e5c724ab3a1e604c06507a0a33b..8944e75c41f84b55a50192faee7b9be2239baf2a 100644 --- a/src/slurmctld/node_scheduler.c +++ b/src/slurmctld/node_scheduler.c @@ -2511,10 +2511,6 @@ extern int select_nodes(struct job_record *job_ptr, bool test_only, if (select_g_job_begin(job_ptr) != SLURM_SUCCESS) { /* Leave job queued, something is hosed */ error("select_g_job_begin(%u): %m", job_ptr->job_id); - - /* Since we began the job earlier we have to cancel it */ - (void) bb_g_job_cancel(job_ptr); - error_code = ESLURM_NODES_BUSY; job_ptr->start_time = 0; job_ptr->time_last_active = 0; @@ -2534,10 +2530,6 @@ extern int select_nodes(struct job_record *job_ptr, bool test_only, error("Select plugin failed to set job resources, nodes"); /* Do not attempt to allocate the select_bitmap nodes since * select plugin failed to set job resources */ - - /* Since we began the bb job earlier we have to cancel it */ - (void) bb_g_job_cancel(job_ptr); - error_code = ESLURM_NODES_BUSY; job_ptr->start_time = 0; job_ptr->time_last_active = 0; @@ -2557,11 +2549,6 @@ extern int select_nodes(struct job_record *job_ptr, bool test_only, if (!job_ptr->job_resrcs) { /* If we don't exit earlier the empty job_resrcs might * be dereferenced later */ - - /* Since we began the bb job earlier we have to cancel - * it */ - (void) bb_g_job_cancel(job_ptr); - error_code = ESLURM_NODES_BUSY; job_ptr->start_time = 0; job_ptr->time_last_active = 0; diff --git a/src/slurmd/slurmd/req.c b/src/slurmd/slurmd/req.c index 907e4678db16bf7fd3ce9b1c505548bad47383a7..3f1e661123f98b80721e069852f80dceebb1675a 100644 --- a/src/slurmd/slurmd/req.c +++ b/src/slurmd/slurmd/req.c @@ -2567,6 +2567,10 @@ _rpc_reboot(slurm_msg_t *msg) sp = xstrdup(reboot_program); reboot_msg = (reboot_msg_t *) msg->data; if (reboot_msg && reboot_msg->features) { + /* + * Run reboot_program with only arguments given + * in reboot_msg->features. + */ info("Node reboot request with features %s being processed", reboot_msg->features); (void) node_features_g_node_set( @@ -2578,7 +2582,8 @@ _rpc_reboot(slurm_msg_t *msg) cmd = xstrdup(sp); } } else { - cmd = xstrdup(sp); + /* Run reboot_program verbatim */ + cmd = xstrdup(reboot_program); info("Node reboot request being processed"); } if (access(sp, R_OK | X_OK) < 0) diff --git a/testsuite/expect/test1.88 b/testsuite/expect/test1.88 index be9041c939ea5ccf155de6e24ab6a21c17f6ec74..01ecf52600b1497d38da6dd550b193c86ef5a21d 100755 --- a/testsuite/expect/test1.88 +++ b/testsuite/expect/test1.88 @@ -109,7 +109,7 @@ make_bash_script $file_in " # set timeout $max_job_delay set no_start 0 -set sbatch_pid [spawn $sbatch -N1-3 -n6 --output=$file_out --error=$file_err -t1 $file_in] +set sbatch_pid [spawn $sbatch -N1-6 -n6 --output=$file_out --error=$file_err -t1 $file_in] expect { -re "Submitted batch job ($number)" { set job_id $expect_out(1,string)