diff --git a/NEWS b/NEWS index 4547ecb2ddb5d6c93608eaa7efcc8d7466e69ebe..b713cae855101fb9d8fef23812a747a5fcd6fd32 100644 --- a/NEWS +++ b/NEWS @@ -197,6 +197,10 @@ documents those changes that are of interest to users and administrators. * Changes in Slurm 14.03.8 ========================== + -- Fix minor memory leak when Job doesn't have nodes on it (Meaning the job + has finished) + -- Fix sinfo/sview to be able to query against nodes in reserved and other + states. * Changes in Slurm 14.03.7 ========================== diff --git a/doc/man/man1/sinfo.1 b/doc/man/man1/sinfo.1 index 6973ff38f3d944eb1e77f1faec6c2fddb26e4b0b..1a37ade6398a95a4952390a3d27f669c26789b55 100644 --- a/doc/man/man1/sinfo.1 +++ b/doc/man/man1/sinfo.1 @@ -299,8 +299,14 @@ List nodes only having the given state(s). Multiple states may be comma separated and the comparison is case insensitive. Possible values include (case insensitive): ALLOC, ALLOCATED, COMP, COMPLETING, DOWN, DRAIN (for node in DRAINING or DRAINED -states), DRAINED, DRAINING, FAIL, FAILING, IDLE, MAINT, NO_RESPOND, -POWER_SAVE, UNK, and UNKNOWN. +states), DRAINED, DRAINING, ERR, ERROR, FAIL, FAILING, FUTURE, FUTR, +IDLE, MAINT, MIX, MIXED, NO_RESPOND, NPC, PERFCTRS, +POWER_DOWN, POWER_UP, RESV, RESERVED, UNK, and UNKNOWN. + +Possible states include: allocated, completing, down, +drained, draining, fail, failing, future, idle, maint, mixed, +perfctrs, power_down, power_up, reserved, and unknown. + By default nodes in the specified state are reported whether they are responding or not. The \fB\-\-dead\fR and \fB\-\-responding\fR options may be @@ -397,9 +403,11 @@ per job's resource allocation. \fBSTATE\fR State of the nodes. Possible states include: allocated, completing, down, -drained, draining, fail, failing, idle, and unknown plus -their abbreviated forms: alloc, comp, donw, drain, drng, -fail, failg, idle, and unk respectively. 
+drained, draining, fail, failing, future, idle, maint, mixed, +perfctrs, power_down, power_up, reserved, and unknown plus +their abbreviated forms: alloc, comp, down, drain, drng, +fail, failg, futr, idle, maint, mix, npc, pow_dn, pow_up, resv, +and unk respectively. Note that the suffix "*" identifies nodes that are presently not responding. .TP diff --git a/src/sinfo/opts.c b/src/sinfo/opts.c index ab91ad61d517674c8f6f708d3f0c80e740e3a837..fe47b03c0ed2f08f2b700098f4487d012ef9ddd8 100644 --- a/src/sinfo/opts.c +++ b/src/sinfo/opts.c @@ -409,12 +409,15 @@ _node_state_list (void) xstrcat (all_states, node_state_string(i)); } - xstrcat(all_states, ",DRAIN,DRAINED,DRAINING,NO_RESPOND"); + xstrcat(all_states, + ",DRAIN,DRAINED,DRAINING,NO_RESPOND,RESERVED,PERFCTRS"); xstrcat(all_states, ","); xstrcat(all_states, node_state_string(NODE_STATE_COMPLETING)); xstrcat(all_states, ","); xstrcat(all_states, node_state_string(NODE_STATE_POWER_SAVE)); xstrcat(all_states, ","); + xstrcat(all_states, node_state_string(NODE_STATE_POWER_UP)); + xstrcat(all_states, ","); xstrcat(all_states, node_state_string(NODE_STATE_FAIL)); xstrcat(all_states, ","); xstrcat(all_states, node_state_string(NODE_STATE_MAINT)); @@ -460,6 +463,12 @@ _node_state_id (char *str) return NODE_STATE_DRAIN | NODE_STATE_IDLE; if (strncasecmp("ERROR", str, len) == 0) return NODE_STATE_ERROR; + if ((strncasecmp("RESV", str, len) == 0) || + (strncasecmp("RESERVED", str, len) == 0)) + return NODE_STATE_RES; + if ((strncasecmp("PERFCTRS", str, len) == 0) || + (strncasecmp("NPC", str, len) == 0)) + return NODE_STATE_NET; if ((strncasecmp("DRAINING", str, len) == 0) || (strncasecmp("DRNG", str, len) == 0)) return NODE_STATE_DRAIN | NODE_STATE_ALLOCATED; @@ -469,6 +478,8 @@ _node_state_id (char *str) return NODE_STATE_NO_RESPOND; if (_node_state_equal (NODE_STATE_POWER_SAVE, str)) return NODE_STATE_POWER_SAVE; + if (_node_state_equal (NODE_STATE_POWER_UP, str)) + return NODE_STATE_POWER_UP; if (_node_state_equal 
(NODE_STATE_FAIL, str)) return NODE_STATE_FAIL; if (_node_state_equal (NODE_STATE_MAINT, str)) diff --git a/src/slurmctld/node_scheduler.c b/src/slurmctld/node_scheduler.c index 25a77c7913d79b70bb0195144445462daa50c2a6..0a121d214b325c3008a432b3de3f9b8c9c3fe356 100644 --- a/src/slurmctld/node_scheduler.c +++ b/src/slurmctld/node_scheduler.c @@ -2674,7 +2674,7 @@ extern void build_node_details(struct job_record *job_ptr, bool new_alloc) if ((job_ptr->node_bitmap == NULL) || (job_ptr->nodes == NULL)) { /* No nodes allocated, we're done... */ job_ptr->node_cnt = 0; - job_ptr->node_addr = NULL; + xfree(job_ptr->node_addr); return; } diff --git a/src/sview/common.c b/src/sview/common.c index 929f80ee27517a4e11f4ecfba9d28b5470a55f62..184c310f5d9a5176886e94322a64ec17331a2bfd 100644 --- a/src/sview/common.c +++ b/src/sview/common.c @@ -1914,8 +1914,7 @@ extern void remove_old(GtkTreeModel *model, int updated) gtk_tree_path_free(path); } -extern GtkWidget *create_pulldown_combo(display_data_t *display_data, - int count) +extern GtkWidget *create_pulldown_combo(display_data_t *display_data) { GtkListStore *store = NULL; GtkWidget *combo = NULL; @@ -1924,12 +1923,11 @@ extern GtkWidget *create_pulldown_combo(display_data_t *display_data, int i=0; store = gtk_list_store_new(2, G_TYPE_INT, G_TYPE_STRING); - for(i=0; i<count; i++) { - if (display_data[i].id == -1) - break; + while (display_data[i].id != -1) { gtk_list_store_append(store, &iter); gtk_list_store_set(store, &iter, 0, display_data[i].id, 1, display_data[i].name, -1); + i++; } combo = gtk_combo_box_new_with_model(GTK_TREE_MODEL(store)); diff --git a/src/sview/popups.c b/src/sview/popups.c index 6dec74b42c99df18f47aa87493ce00b1a559cf20..0a1bac90023a649f6a1b22feb92d638d86c5cbd5 100644 --- a/src/sview/popups.c +++ b/src/sview/popups.c @@ -696,7 +696,7 @@ extern void create_search_popup(GtkAction *action, gpointer user_data) }; sview_search_info.search_type = SEARCH_JOB_STATE; - entry = 
create_pulldown_combo(pulldown_display_data, JOB_END); + entry = create_pulldown_combo(pulldown_display_data); label = gtk_label_new("Which state?"); } else if (!strcmp(name, "partition_name")) { sview_search_info.search_type = SEARCH_PARTITION_NAME; @@ -712,7 +712,7 @@ extern void create_search_popup(GtkAction *action, gpointer user_data) }; sview_search_info.search_type = SEARCH_PARTITION_STATE; - entry = create_pulldown_combo(pulldown_display_data, 5); + entry = create_pulldown_combo(pulldown_display_data); label = gtk_label_new("Which state?"); } else if (!strcmp(name, "node_name")) { sview_search_info.search_type = SEARCH_NODE_NAME; @@ -725,24 +725,39 @@ extern void create_search_popup(GtkAction *action, gpointer user_data) "(ranged or comma separated)"); } else if (!strcmp(name, "node_state")) { display_data_t pulldown_display_data[] = { + {G_TYPE_NONE, NODE_STATE_ALLOCATED, "Allocated", + TRUE, -1}, + {G_TYPE_NONE, NODE_STATE_COMPLETING, "Completing", + TRUE, -1}, {G_TYPE_NONE, NODE_STATE_DOWN, "Down", TRUE, -1}, {G_TYPE_NONE, NODE_STATE_ALLOCATED | NODE_STATE_DRAIN, "Draining", TRUE, -1}, {G_TYPE_NONE, NODE_STATE_IDLE | NODE_STATE_DRAIN, "Drained", TRUE, -1}, - {G_TYPE_NONE, NODE_STATE_IDLE, "Idle", TRUE, -1}, - {G_TYPE_NONE, NODE_STATE_ALLOCATED, "Allocated", - TRUE, -1}, {G_TYPE_NONE, NODE_STATE_ERROR, "Error", TRUE, -1}, + {G_TYPE_NONE, NODE_STATE_FAIL, "Fail", TRUE, -1}, + {G_TYPE_NONE, NODE_STATE_FAIL | NODE_STATE_ALLOCATED, + "Failing", TRUE, -1}, + {G_TYPE_NONE, NODE_STATE_FUTURE, "Future", TRUE, -1}, + {G_TYPE_NONE, NODE_STATE_IDLE, "Idle", TRUE, -1}, + {G_TYPE_NONE, NODE_STATE_MAINT, "Maint", TRUE, -1}, {G_TYPE_NONE, NODE_STATE_MIXED, "Mixed", TRUE, -1}, - {G_TYPE_NONE, NODE_STATE_COMPLETING, "Completing", - TRUE, -1}, + {G_TYPE_NONE, NODE_STATE_NO_RESPOND, + "No Respond", TRUE, -1}, + {G_TYPE_NONE, NODE_STATE_NET | NODE_STATE_IDLE, + "PerfCTRs", TRUE, -1}, + {G_TYPE_NONE, NODE_STATE_POWER_SAVE, + "Power Down", TRUE, -1}, + {G_TYPE_NONE, 
NODE_STATE_POWER_UP, + "Power Up", TRUE, -1}, + {G_TYPE_NONE, NODE_STATE_RES | NODE_STATE_IDLE, + "Reserved", TRUE, -1}, {G_TYPE_NONE, NODE_STATE_UNKNOWN, "Unknown", TRUE, -1}, {G_TYPE_NONE, -1, NULL, FALSE, -1} }; sview_search_info.search_type = SEARCH_NODE_STATE; - entry = create_pulldown_combo(pulldown_display_data, PAGE_CNT); + entry = create_pulldown_combo(pulldown_display_data); label = gtk_label_new("Which state?"); } else if ((cluster_flags & CLUSTER_FLAG_BG) && !strcmp(name, "bg_block_name")) { @@ -792,7 +807,7 @@ extern void create_search_popup(GtkAction *action, gpointer user_data) } } sview_search_info.search_type = SEARCH_BLOCK_STATE; - entry = create_pulldown_combo(pulldown_display_data, PAGE_CNT); + entry = create_pulldown_combo(pulldown_display_data); label = gtk_label_new("Which state?"); } else if (!strcmp(name, "reservation_name")) { sview_search_info.search_type = SEARCH_RESERVATION_NAME; diff --git a/src/sview/sview.h b/src/sview/sview.h index 26509fe8068c8f91d72386495f0891323be0a714..a72c080c96b3f232e6c91563ac98d835af5b7207 100644 --- a/src/sview/sview.h +++ b/src/sview/sview.h @@ -628,8 +628,7 @@ extern gboolean delete_popups(void); extern void *popup_thr(popup_info_t *popup_win); extern void set_for_update(GtkTreeModel *model, int updated); extern void remove_old(GtkTreeModel *model, int updated); -extern GtkWidget *create_pulldown_combo(display_data_t *display_data, - int count); +extern GtkWidget *create_pulldown_combo(display_data_t *display_data); extern char *str_tolower(char *upper_str); extern char *get_reason(void); extern void display_admin_edit(GtkTable *table, void *type_msg, int *row, diff --git a/testsuite/expect/Makefile.am b/testsuite/expect/Makefile.am index 7e2f374a1b9e0a4479a88f1bcf2e59a412caade3..5aae3a7745e1a8b951500722720235a9578994f5 100644 --- a/testsuite/expect/Makefile.am +++ b/testsuite/expect/Makefile.am @@ -303,6 +303,7 @@ EXTRA_DIST = \ test12.5 \ test12.6 \ test12.6.prog.c \ + test12.7 \ test13.1 \ test13.2 \ 
test14.1 \ diff --git a/testsuite/expect/Makefile.in b/testsuite/expect/Makefile.in index 35034c43fe4b991f2b1118d6cdcd93b169113138..7f7df64817c773e190b1ef4f9c533d4c86880920 100644 --- a/testsuite/expect/Makefile.in +++ b/testsuite/expect/Makefile.in @@ -687,6 +687,7 @@ EXTRA_DIST = \ test12.5 \ test12.6 \ test12.6.prog.c \ + test12.7 \ test13.1 \ test13.2 \ test14.1 \ diff --git a/testsuite/expect/README b/testsuite/expect/README index 444342391833d17dc5f6f3f1a71558df95e428f4..e3ea9045ea3ff64b2c66291fbef5979e130d07f0 100644 --- a/testsuite/expect/README +++ b/testsuite/expect/README @@ -426,6 +426,7 @@ test12.3 Test sacct filtering of records by account and job name. test12.4 Test sacct --b, g, j, l, n, p, u, v options. test12.5 Test sacct --helpformat option. test12.6 Test hdf5 acct_gather_profile (--profile=task) +test12.7 Validate that -D shows the correct state when jobs are requeued. test13.# Testing of switch plugins ==================================== diff --git a/testsuite/expect/test12.7 b/testsuite/expect/test12.7 new file mode 100755 index 0000000000000000000000000000000000000000..9a4fbd5e928156c996863fa9797e28c53c9af8bf --- /dev/null +++ b/testsuite/expect/test12.7 @@ -0,0 +1,256 @@ +#!/usr/bin/expect +############################################################################ +# Purpose: Test of SLURM functionality +# Validate that sacct -D shows correct job steps and states +# when a job is requeued +# +# Output: "TEST: #.#" followed by "SUCCESS" if test was successful, OR +# "FAILURE: ..." otherwise with an explanation of the failure, OR +# anything else indicates a failure mode that must be investigated. +############################################################################ +# Copyright (C) 2014 SchedMD LLC +# Written by Nathan Yee <nyee32@schedmd.com> +# +# This file is part of SLURM, a resource management program. +# For details, see <http://slurm.schedmd.com/>. +# Please also read the included file: DISCLAIMER. 
+# +# SLURM is free software; you can redistribute it and/or modify it under +# the terms of the GNU General Public License as published by the Free +# Software Foundation; either version 2 of the License, or (at your option) +# any later version. +# +# SLURM is distributed in the hope that it will be useful, but WITHOUT ANY +# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +# FOR A PARTICULAR PURPOSE. See the GNU General Public License for more +# details. +# +# You should have received a copy of the GNU General Public License along +# with SLURM; if not, write to the Free Software Foundation, Inc. +# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. +############################################################################ +source ./globals + +set test_id 12.7 +set exit_code 0 +set job_id 0 +set node "" +set file_in "test$test_id\_sc" + +print_header $test_id + +proc mod_state { state } { + + global scontrol node exit_code + + spawn $scontrol update nodename=$node state=$state + expect { + timeout { + send_user "\nFAILURE: scontrol is not responding\n" + set exit_code 1 + } + eof { + wait + } + } +} + +proc check_step { num } { + + global sacct job_id exit_code + + set steps 0 + spawn $sacct --job=$job_id\.batch -D --noheader --format=jobid%30 -P + expect { + -re "batch" { + incr steps 1 + exp_continue + } + timeout { + send_user "\nFAILURE: sacct is not responding\n" + set exit_code 1 + } + eof { + wait + } + + } + if {$num != $steps} { + send_user "\nFAILURE: found $steps step(s) when expecting " + send_user "$num steps\n" + set exit_code 1 + } +} + +proc check_sacct_states { states } { + global job_id sacct exit_code + + set state_num 0 + spawn $sacct --job=$job_id -D --noheader --format=state -P + expect { + -re ($states) { + incr state_num 1 + exp_continue + } + timeout { + send_user "\nFAILURE: sacct is not responding\n" + set exit_code 1 + } + eof { + wait + } + } + return $state_num +} + +make_bash_script $file_in "sleep 2" + 
+# Start a batch job to get a usable node +spawn $sbatch -t1 --exclusive -o/dev/null $file_in +expect { + -re "Submitted batch job ($number)" { + set job_id $expect_out(1,string) + exp_continue + } + timeout { + send_user "\nFAILURE: sbatch is not responding\n" + set exit_code 1 + } + eof { + wait + } +} + +if {$job_id == 0} { + send_user "\nFAILURE: sbatch did not submit job\n" + exit 1 +} + +wait_for_job $job_id RUNNING + +set found 0 +spawn $scontrol show job $job_id +expect { + -re "NodeList=($alpha_numeric_nodelist)" { + set node $expect_out(1,string) + set found 1 + exp_continue + } + timeout { + send_user "\nFAILURE: scontrol is not responding\n" + set exit_code 1 + } + eof { + wait + } +} + +if {$found != 1} { + send_user "\nFAILURE: was not able to get usable node\n" + exit 1 +} + +cancel_job $job_id + +make_bash_script $file_in "sleep 20" + +# Submit job to be requeued +set job_id 0 +spawn $sbatch -N1 -w$node --exclusive -o/dev/null --requeue $file_in +expect { + -re "Submitted batch job ($number)" { + set job_id $expect_out(1,string) + exp_continue + } + timeout { + send_user "\nFAILURE: sbatch is not responding\n" + set exit_code 1 + } + eof { + wait + } +} + +if {$job_id == 0} { + send_user "\nFAILURE: sbatch did not submit job\n" + exit 1 +} + +wait_for_job $job_id RUNNING + +# Set the node that the job is running on to down +mod_state "down" + +# Wait a little bit for node state to change +sleep 5 + +# Set the node back to resume +mod_state "resume" + +# Check the number of steps +check_step 0 + +# Check the job state +if {[check_sacct_states "NODE_FAIL|PENDING"] != 2} { + send_user "\nFAILURE: jobs state should be NODE_FAIL and PENDING\n" + set exit_code 1 +} + +wait_for_job $job_id RUNNING + +# Check the number of steps after job is running +check_step 0 + +if {[check_sacct_states "NODE_FAIL|RUNNING"] != 2} { + send_user "\nFAILURE: jobs state should be NODE_FAIL and RUNNING\n" + set exit_code 1 +} + +# Requeue the job +spawn $scontrol requeue 
$job_id +expect { + timeout { + send_user "\nFAILURE: scontrol is not responding\n" + set exit_code 1 + } + eof { + wait + } +} + +# Wait a bit for the job to be requeued then check its state +sleep 8 + +if {[check_sacct_states "NODE_FAIL|REQUEUE|CANCELLED|PENDING"] != 4} { + send_user "\nFAILURE: states are not as expected\n" + set exit_code 1 +} + +wait_for_job $job_id RUNNING + +# Check for steps after requeue +check_step 1 + +if {[check_sacct_states "NODE_FAIL|REQUEUE|CANCELLED|RUNNING"] != 4} { + send_user "\nFAILURE: states not as expected\n" + set exit_code 1 +} + +wait_for_job $job_id DONE + +# Check steps after job has completed +check_step 2 + +# COMPLETED will show up 2 times, that is why we are checking for 5 +if {[check_sacct_states "NODE_FAIL|REQUEUE|CANCELLED|COMPLETED"] != 5} { + send_user "\nFAILURE: job states are not as expected\n" + set exit_code 1 +} + +cancel_job $job_id + +if {$exit_code == 0} { + exec $bin_rm $file_in + send_user "\nSUCCESS\n" +} +exit $exit_code