diff --git a/NEWS b/NEWS index debdd31d355f6db162547769ecc723772bae90ae..c04d49b9b0e63dcd7da42cbf200b32c2247cde2b 100644 --- a/NEWS +++ b/NEWS @@ -187,6 +187,11 @@ documents those changes that are of interest to users and admins. -- Added ApbasilTimeout parameter to the cray.conf configuration file. -- Handle in the API if parts of the node structure are NULL. -- Fix srun hang when IO fails to start at launch. + -- Fix for GRES bitmap not matching the GRES count resulting in abort + (requires manual resetting of GRES count, changes to gres.conf file, + and slurmd restarts). + -- Modify sview to better support job arrays. + -- Modify squeue to support longer job ID values (for many job array tasks). * Changes in Slurm 2.6.4 ======================== diff --git a/src/common/gres.c b/src/common/gres.c index 5d7fdb2fb37ca710797ab86900bf5e0cd6ce58a0..5a6d7dd5759b9ffa7aad3e40a0e8681fa16d7ff6 100644 --- a/src/common/gres.c +++ b/src/common/gres.c @@ -987,7 +987,8 @@ extern int gres_plugin_node_config_unpack(Buf buffer, char* node_name) tmp_name, count, node_name); count = 1024; } - gres_context[j].has_file = has_file; + if (has_file) /* Don't clear if already set */ + gres_context[j].has_file = has_file; break; } if (j >= gres_context_cnt) { @@ -1407,7 +1408,7 @@ extern int _node_config_validate(char *node_name, char *orig_config, else if (gres_data->gres_cnt_avail == NO_VAL) gres_data->gres_cnt_avail = 0; - if (context_ptr->has_file) { + if (context_ptr->has_file || gres_data->gres_cnt_avail) { if (gres_data->gres_bit_alloc == NULL) { gres_data->gres_bit_alloc = bit_alloc(gres_data->gres_cnt_avail); @@ -1534,7 +1535,7 @@ static int _node_reconfig(char *node_name, char *orig_config, char **new_config, else if (gres_data->gres_cnt_avail == NO_VAL) gres_data->gres_cnt_avail = 0; - if (context_ptr->has_file) { + if (context_ptr->has_file || gres_data->gres_cnt_avail) { if (gres_data->gres_bit_alloc == NULL) { gres_data->gres_bit_alloc = bit_alloc(gres_data->gres_cnt_avail); diff --git a/src/squeue/print.c b/src/squeue/print.c index f63963454c8c0136448f1b3b650c732761046f9c..10e5e52e8bfef9b4097727bebdfe22d529cb442f 100644 --- a/src/squeue/print.c +++ b/src/squeue/print.c @@ -398,19 +398,27 @@ int _print_job_job_id(job_info_t * job, int width, bool right, char* suffix) } else if ((job->array_task_id != NO_VAL) && !params.array_flag && IS_JOB_PENDING(job) && job->node_inx) { - uint32_t i, max_task_id = 0; - char id[FORMAT_STRING_SIZE], task_str[FORMAT_STRING_SIZE]; + uint32_t i, local_width = width, max_task_id = 0; + char *id, *task_str; bitstr_t *task_bits; for (i = 1; i <= job->node_inx[0]; i++) max_task_id = MAX(max_task_id, job->node_inx[i]); task_bits = bit_alloc(max_task_id + 1); for (i = 1; i <= job->node_inx[0]; i++) bit_set(task_bits, job->node_inx[i]); - bit_fmt(task_str, sizeof(task_str), task_bits); - snprintf(id, FORMAT_STRING_SIZE, "%u_[%s]", + if (local_width == 0) { + local_width = bit_set_count(task_bits) * + FORMAT_STRING_SIZE; + } + id = xmalloc(local_width); + task_str = xmalloc(local_width); + bit_fmt(task_str, local_width, task_bits); + snprintf(id, local_width, "%u_[%s]", job->array_job_id, task_str); _print_str(id, width, right, true); bit_free(task_bits); + xfree(id); + xfree(task_str); } else if (job->array_task_id != NO_VAL) { char id[FORMAT_STRING_SIZE]; snprintf(id, FORMAT_STRING_SIZE, "%u_%u", diff --git a/src/sview/job_info.c b/src/sview/job_info.c index 54fc1f06055dece334dc4c907eaa6246e99c13f0..8672720f9c3378a4044478c5aedab59c41cd4692 100644 --- a/src/sview/job_info.c +++ b/src/sview/job_info.c @@ -71,6 +71,8 @@ typedef struct { int job_id; int state; int step_id; + int array_job_id; + int array_task_id; } jobs_foreach_t; typedef struct { @@ -206,7 +208,7 @@ static char *_initial_page_opts = ("JobID,Partition,BG_Block," static display_data_t display_data_job[] = { {G_TYPE_INT, SORTID_POS, NULL, FALSE, EDIT_NONE, refresh_job, create_model_job, admin_edit_job}, - {G_TYPE_INT, SORTID_JOBID, "JobID", FALSE, EDIT_NONE, refresh_job, + {G_TYPE_STRING, SORTID_JOBID, "JobID", FALSE, EDIT_NONE, refresh_job, create_model_job, admin_edit_job}, {G_TYPE_STRING, SORTID_COLOR, NULL, TRUE, EDIT_COLOR, refresh_job, create_model_job, admin_edit_job}, @@ -1604,7 +1606,13 @@ static void _layout_job_record(GtkTreeView *treeview, SELECT_PRINT_RAMDISK_IMAGE)); } - snprintf(tmp_char, sizeof(tmp_char), "%u", job_ptr->job_id); + if (job_ptr->array_task_id != NO_VAL) { + snprintf(tmp_char, sizeof(tmp_char), "%u_%u (%u)", + job_ptr->array_job_id, job_ptr->array_task_id, + job_ptr->job_id); + } else { + snprintf(tmp_char, sizeof(tmp_char), "%u", job_ptr->job_id); + } add_display_treestore_line(update, treestore, &iter, find_col_name(display_data_job, SORTID_JOBID), @@ -1919,8 +1927,8 @@ static void _update_job_record(sview_job_info_t *sview_job_info_ptr, char tmp_cpus_max[40], tmp_mem_min[40], tmp_cpu_req[40]; char tmp_nodes_min[40], tmp_nodes_max[40], tmp_cpus_per_task[40]; char tmp_prio[40], tmp_nice[40], tmp_preempt_time[40]; - char tmp_rqswitch[40], tmp_core_spec[40]; - char tmp_std_err[128], tmp_std_in[128], tmp_std_out[128]; + char tmp_rqswitch[40], tmp_core_spec[40], tmp_job_id[40]; + char tmp_std_err[128], tmp_std_in[128], tmp_std_out[128]; char *tmp_batch, *tmp_cont, *tmp_shared, *tmp_requeue, *tmp_uname; char *tmp_reason, *tmp_nodes; char time_buf[32]; @@ -1936,11 +1944,16 @@ static void _update_job_record(sview_job_info_t *sview_job_info_ptr, job_ptr->alloc_node, job_ptr->alloc_sid); if (job_ptr->array_task_id != NO_VAL) { + snprintf(tmp_job_id, sizeof(tmp_job_id), "%u_%u (%u)", + job_ptr->array_job_id, job_ptr->array_task_id, + job_ptr->job_id); snprintf(tmp_array_job_id, sizeof(tmp_array_job_id), "%u", job_ptr->array_job_id); snprintf(tmp_array_task_id, sizeof(tmp_array_task_id), "%u", job_ptr->array_task_id); } else { + snprintf(tmp_job_id, sizeof(tmp_job_id), "%u", + job_ptr->job_id); snprintf(tmp_array_job_id, sizeof(tmp_array_job_id), "N/A"); snprintf(tmp_array_task_id, sizeof(tmp_array_task_id), "N/A"); } @@ -2166,7 +2179,7 @@ static void _update_job_record(sview_job_info_t *sview_job_info_ptr, SORTID_FEATURES, job_ptr->features, SORTID_GRES, job_ptr->gres, SORTID_GROUP_ID, tmp_group_id, - SORTID_JOBID, job_ptr->job_id, + SORTID_JOBID, tmp_job_id, SORTID_LICENSES, job_ptr->licenses, SORTID_MEM_MIN, tmp_mem_min, SORTID_NAME, job_ptr->name, @@ -2348,9 +2361,15 @@ static void _layout_step_record(GtkTreeView *treeview, SORTID_USER_ID), uname); xfree(uname); - snprintf(tmp_char, sizeof(tmp_char), "%u.%u", - step_ptr->job_id, - step_ptr->step_id); + if (step_ptr->array_job_id) { + snprintf(tmp_char, sizeof(tmp_char), "%u_%u.%u (%u.%u)", + step_ptr->array_job_id, step_ptr->array_task_id, + step_ptr->step_id, + step_ptr->job_id, step_ptr->step_id); + } else { + snprintf(tmp_char, sizeof(tmp_char), "%u.%u", + step_ptr->job_id, step_ptr->step_id); + } add_display_treestore_line(update, treestore, &iter, find_col_name(display_data_job, SORTID_JOBID), @@ -2446,6 +2465,7 @@ static void _update_step_record(job_step_info_t *step_ptr, char tmp_nodes[50]; char tmp_cpu_min[40], tmp_time_run[40], tmp_time_limit[40]; char tmp_node_cnt[40], tmp_time_start[40], tmp_task_cnt[40]; + char tmp_step_id[40]; time_t now_time = time(NULL); enum job_states state; int color_inx = step_ptr->step_id % sview_colors_cnt; @@ -2496,6 +2516,8 @@ static void _update_step_record(job_step_info_t *step_ptr, slurm_make_time_str((time_t *)&step_ptr->start_time, tmp_time_start, sizeof(tmp_time_start)); + snprintf(tmp_step_id, sizeof(tmp_step_id), "%u", step_ptr->step_id); + tmp_uname = uid_to_string((uid_t)step_ptr->user_id); gtk_tree_store_set(treestore, iter, @@ -2504,7 +2526,7 @@ static void _update_step_record(job_step_info_t *step_ptr, SORTID_COLOR_INX, color_inx, SORTID_CPUS, tmp_cpu_min, SORTID_GRES, step_ptr->gres, - SORTID_JOBID, step_ptr->step_id, + SORTID_JOBID, tmp_step_id, SORTID_NAME, step_ptr->name, SORTID_NODE_INX, step_ptr->node_inx, SORTID_NODELIST, tmp_nodes, @@ -2581,10 +2603,13 @@ static void _update_info_step(sview_job_info_t *sview_job_info_ptr, sizeof(GtkTreeIter)); } while (1) { + char *tmp_stepid; /* search for the jobid and check to see if - it is in the list */ + * it is in the list */ gtk_tree_model_get(model, step_iter, SORTID_JOBID, - &stepid, -1); + &tmp_stepid, -1); + stepid = atoi(tmp_stepid); + g_free(tmp_stepid); if (stepid == (int)step_ptr->step_id) { /* update with new info */ _update_step_record(step_ptr, @@ -2650,8 +2675,16 @@ static void _update_info_job(List info_list, sview_job_info->iter_set = false; if (sview_job_info->iter_set) { + char *tmp_jobid, *offset; gtk_tree_model_get(model, &sview_job_info->iter_ptr, - SORTID_JOBID, &jobid, -1); + SORTID_JOBID, &tmp_jobid, -1); + offset = strchr(tmp_jobid, '('); + if (offset) + offset++; + else + offset = tmp_jobid; + jobid = atoi(offset); + g_free(tmp_jobid); if (jobid != job_ptr->job_id) /* Bad pointer */ sview_job_info->iter_set = false; } @@ -3234,6 +3267,7 @@ extern void admin_edit_job(GtkCellRendererText *cell, GtkTreePath *path = gtk_tree_path_new_from_string(path_string); GtkTreeIter iter; job_desc_msg_t *job_msg = xmalloc(sizeof(job_desc_msg_t)); + char *tmp_jobid, *offset; char *temp = NULL; char *old_text = NULL; @@ -3249,9 +3283,17 @@ extern void admin_edit_job(GtkCellRendererText *cell, slurm_init_job_desc_msg(job_msg); gtk_tree_model_get(GTK_TREE_MODEL(treestore), &iter, - SORTID_JOBID, &job_msg->job_id, + SORTID_JOBID, &tmp_jobid, column, &old_text, -1); + offset = strchr(tmp_jobid, '('); + if (offset) + offset++; + else + offset = tmp_jobid; + job_msg->job_id = atoi(offset); + g_free(tmp_jobid); + gtk_tree_model_get(GTK_TREE_MODEL(treestore), &iter, SORTID_ALLOC, &stepid, -1); if (stepid) @@ -3787,13 +3829,22 @@ extern void popup_all_job(GtkTreeModel *model, GtkTreeIter *iter, int id) GError *error = NULL; int i=0; char *type; + char *tmp_jobid, *offset; if (cluster_flags & CLUSTER_FLAG_BG) type = "Midplane"; else type = "Node"; - gtk_tree_model_get(model, iter, SORTID_JOBID, &jobid, -1); + gtk_tree_model_get(model, iter, SORTID_JOBID, &tmp_jobid, -1); + offset = strchr(tmp_jobid, '('); + if (offset) + offset++; + else + offset = tmp_jobid; + jobid = atoi(offset); + g_free(tmp_jobid); + gtk_tree_model_get(model, iter, SORTID_ALLOC, &stepid, -1); if (stepid) @@ -4058,10 +4109,22 @@ static void selected_foreach_build_list(GtkTreeModel *model, { uint32_t jobid = NO_VAL; uint32_t stepid = NO_VAL; + uint32_t array_job_id = NO_VAL, array_task_id = NO_VAL; int state; jobs_foreach_t *fe_ptr = NULL; + char *tmp_jobid, *offset, *end_ptr; + + gtk_tree_model_get(model, iter, SORTID_JOBID, &tmp_jobid, -1); + offset = strchr(tmp_jobid, '('); + if (offset) { + array_job_id = strtol(tmp_jobid, &end_ptr, 10); + array_task_id = strtol(end_ptr+1, NULL, 10); + offset++; + } else + offset = tmp_jobid; + jobid = atoi(offset); + g_free(tmp_jobid); - gtk_tree_model_get(model, iter, SORTID_JOBID, &jobid, -1); gtk_tree_model_get(model, iter, SORTID_ALLOC, &stepid, -1); if (stepid) @@ -4078,6 +4141,8 @@ static void selected_foreach_build_list(GtkTreeModel *model, fe_ptr->job_id = jobid; fe_ptr->step_id = stepid; fe_ptr->state = state; + fe_ptr->array_job_id = array_job_id; + fe_ptr->array_task_id = array_task_id; list_append(foreach_list, fe_ptr); /* stuff target away*/ @@ -4086,10 +4151,13 @@ static void selected_foreach_build_list(GtkTreeModel *model, else xstrcat(stacked_job_list, ": "); - if (stepid == NO_VAL) + if (array_task_id == NO_VAL) xstrfmtcat(stacked_job_list, "%u", jobid); else - xstrfmtcat(stacked_job_list, "%u.%u", jobid, stepid); + xstrfmtcat(stacked_job_list, "%u_%u", + array_job_id, array_task_id); + if (stepid != NO_VAL) + xstrfmtcat(stacked_job_list, ".%u", stepid); } static void _edit_each_job (GtkTreeModel *model, GtkTreeIter *iter, @@ -4217,6 +4285,7 @@ extern void admin_job(GtkTreeModel *model, GtkTreeIter *iter, GtkWidget *label = NULL; GtkWidget *entry = NULL; GtkWidget *popup; + char *tmp_jobid, *offset; if (strcmp(type, "Edit Job") == 0) return _edit_jobs(model, iter, type, treeview); @@ -4229,7 +4298,15 @@ extern void admin_job(GtkTreeModel *model, GtkTreeIter *iter, NULL); gtk_window_set_transient_for(GTK_WINDOW(popup), NULL); - gtk_tree_model_get(model, iter, SORTID_JOBID, &jobid, -1); + gtk_tree_model_get(model, iter, SORTID_JOBID, &tmp_jobid, -1); + offset = strchr(tmp_jobid, '('); + if (offset) + offset++; + else + offset = tmp_jobid; + jobid = atoi(offset); + g_free(tmp_jobid); + gtk_tree_model_get(model, iter, SORTID_ALLOC, &stepid, -1); if (stepid) stepid = NO_VAL;