diff --git a/NEWS b/NEWS index fc0762a199305ac5f351dc3a07243fa522540385..fb98a62796fa3f01046f41ca1802337870520eda 100644 --- a/NEWS +++ b/NEWS @@ -18,6 +18,10 @@ documents those changes that are of interest to users and admins. -- BLUEGENE - Fix, for when trying to finish a torus on a block already visited. Even though this may be possible electrically this isn't valid in the under lying infrastructure. + -- Fix, in mysql plugins change mediumints to int to support full 32bit + numbers. + -- Add sinfo node state filtering support for NO_RESPOND, POWER_SAVE, FAIL, + and MAINT states. * Changes in SLURM 2.0.0 ======================== @@ -27,7 +31,7 @@ documents those changes that are of interest to users and admins. NUDT. -- Configuration parameter ResumeDelay replaced by SuspendTimeout and ResumeTimeout. - -- BLUEGENE - sview/sinfo now displays correct cnode numbers for drained nodes + -- BLUEGENE - sview/sinfo now displays correct cnode numbers for drained nodes or blocks in error state. -- Fix some batch job launch bugs when powering up suspended nodes. -- Added option '-T' for sacct to truncate time of start and end and set diff --git a/RELEASE_NOTES b/RELEASE_NOTES index 1b82110de6979b3d6d98aef628a8f9f175b185bc..4e92c65a67a04999d76c97b87566d817d4a06455 100644 --- a/RELEASE_NOTES +++ b/RELEASE_NOTES @@ -129,6 +129,9 @@ COMMAND CHANGES (see man pages for details) * Time format of all SLURM command set to ISO 8601 (yyyy-mm-ddThh:mm:ss) unless the configure option "--disable-iso8601" is used at build time. * sacct -S to status a job will no longer work. Use sstat from now on. +* sacct --nodes option can be used to filter jobs by allocated node. +* sacct default starttime is midnight of the previous day rather than the + start of the database. * sacct and sstat have been rewritten to have a more sacctmgr like feel * Added the sprio command to view the factors that comprise a job's scheduling priority - works only with the priority/multifactor plugin. 
diff --git a/doc/html/power_save.shtml b/doc/html/power_save.shtml index 05344d477e977352de5434bfabc2605922cb10c8..27ba012f0f23c4b78be726c3bde95c5888a08ce3 100644 --- a/doc/html/power_save.shtml +++ b/doc/html/power_save.shtml @@ -20,7 +20,12 @@ SLURM's support to increase power demands in a gradual fashion.</p> <h2>Configuration</h2> <p>A great deal of flexibility is offered in terms of when and -how idle nodes are put into or removed from power save mode. +how idle nodes are put into or removed from power save mode. +Note that the SLURM control daemon, <i>slurmctld</i>, must be +restarted to initially enable power saving mode. +Changes in the configuration parameters (e.g. <i>SuspendTime</i>) +will take effect after modifying the <i>slurm.conf</i> configuration +file and executing "<i>scontrol reconfig</i>". The following configuration parameters are available: <ul> @@ -188,6 +193,6 @@ In order to minimize this risk, when the <i>slurmctld</i> daemon is started and node which should be allocated to a job fails to respond, the <b>ResumeProgram</b> will be executed (possibly for a second time).</p> -<p style="text-align:center;">Last modified 18 May 2009</p> +<p style="text-align:center;">Last modified 26 May 2009</p> <!--#include virtual="footer.txt"--> diff --git a/doc/man/man1/sinfo.1 b/doc/man/man1/sinfo.1 index 59c1d17ab4cc028c0b4725114387bb0141b734dc..75f3c760f303487ac2f794918233eb245c9478b5 100644 --- a/doc/man/man1/sinfo.1 +++ b/doc/man/man1/sinfo.1 @@ -1,4 +1,4 @@ -.TH SINFO "1" "February 2008" "sinfo 2.0" "Slurm components" +.TH SINFO "1" "May 2008" "sinfo 2.0" "Slurm components" .SH "NAME" sinfo \- view information about SLURM nodes and partitions. @@ -257,8 +257,8 @@ default sort value is "N" (increasing node name). List nodes only having the given state(s). Multiple states may be comma separated and the comparison is case insensitive. 
Possible values include (case insensitive): ALLOC, ALLOCATED, -COMP, COMPLETING, DOWN, DRAIN, DRAINED, DRNG, DRAINING, FAIL, -FAILING, IDLE, MAINT, UNK, and UNKNOWN. +COMP, COMPLETING, DOWN, DRAIN, (DRAINED or DRAINING), FAIL, +FAILING, IDLE, MAINT, NO_RESPOND, POWER_SAVE, UNK, and UNKNOWN. By default nodes in the specified state are reported whether they are responding or not. The \fB\-\-dead\fR and \fB\-\-responding\fR options may be diff --git a/src/plugins/accounting_storage/mysql/accounting_storage_mysql.c b/src/plugins/accounting_storage/mysql/accounting_storage_mysql.c index f21cb0f06a7b434ffe15836576dfb5714bcdede4..89eb1a92f2bddc2f21b8366280fe88b06015958a 100644 --- a/src/plugins/accounting_storage/mysql/accounting_storage_mysql.c +++ b/src/plugins/accounting_storage/mysql/accounting_storage_mysql.c @@ -2886,8 +2886,8 @@ static int _mysql_acct_check_tables(MYSQL *db_conn) { "deleted", "tinyint default 0" }, { "name", "tinytext not null" }, { "control_host", "tinytext not null default ''" }, - { "control_port", "mediumint not null default 0" }, - { "rpc_version", "mediumint not null default 0" }, + { "control_port", "int unsigned not null default 0" }, + { "rpc_version", "smallint unsigned not null default 0" }, { "classification", "smallint unsigned default 0" }, { NULL, NULL} }; @@ -2923,10 +2923,10 @@ static int _mysql_acct_check_tables(MYSQL *db_conn) storage_field_t job_table_fields[] = { { "id", "int not null auto_increment" }, { "deleted", "tinyint default 0" }, - { "jobid", "mediumint unsigned not null" }, - { "associd", "mediumint unsigned not null" }, + { "jobid", "int unsigned not null" }, + { "associd", "int unsigned not null" }, { "wckey", "tinytext not null default ''" }, - { "wckeyid", "mediumint unsigned not null" }, + { "wckeyid", "int unsigned not null" }, { "uid", "smallint unsigned not null" }, { "gid", "smallint unsigned not null" }, { "cluster", "tinytext not null" }, @@ -2944,9 +2944,9 @@ static int _mysql_acct_check_tables(MYSQL 
*db_conn) { "state", "smallint not null" }, { "comp_code", "int default 0 not null" }, { "priority", "int not null" }, - { "req_cpus", "mediumint unsigned not null" }, - { "alloc_cpus", "mediumint unsigned not null" }, - { "alloc_nodes", "mediumint unsigned not null" }, + { "req_cpus", "int unsigned not null" }, + { "alloc_cpus", "int unsigned not null" }, + { "alloc_nodes", "int unsigned not null" }, { "nodelist", "text" }, { "node_inx", "text" }, { "kill_requid", "smallint default -1 not null" }, @@ -2994,7 +2994,7 @@ static int _mysql_acct_check_tables(MYSQL *db_conn) { "name", "text not null" }, { "cluster", "text not null" }, { "deleted", "tinyint default 0" }, - { "cpus", "mediumint unsigned not null" }, + { "cpus", "int unsigned not null" }, { "assoclist", "text not null default ''" }, { "nodelist", "text not null default ''" }, { "node_inx", "text not null default ''" }, @@ -3017,9 +3017,9 @@ static int _mysql_acct_check_tables(MYSQL *db_conn) { "state", "smallint not null" }, { "kill_requid", "smallint default -1 not null" }, { "comp_code", "int default 0 not null" }, - { "nodes", "mediumint unsigned not null" }, - { "cpus", "mediumint unsigned not null" }, - { "tasks", "mediumint unsigned not null" }, + { "nodes", "int unsigned not null" }, + { "cpus", "int unsigned not null" }, + { "tasks", "int unsigned not null" }, { "task_dist", "smallint default 0" }, { "user_sec", "int unsigned default 0 not null" }, { "user_usec", "int unsigned default 0 not null" }, @@ -3027,26 +3027,26 @@ static int _mysql_acct_check_tables(MYSQL *db_conn) { "sys_usec", "int unsigned default 0 not null" }, { "max_vsize", "int unsigned default 0 not null" }, { "max_vsize_task", "smallint unsigned default 0 not null" }, - { "max_vsize_node", "mediumint unsigned default 0 not null" }, + { "max_vsize_node", "int unsigned default 0 not null" }, { "ave_vsize", "float default 0.0 not null" }, { "max_rss", "int unsigned default 0 not null" }, { "max_rss_task", "smallint unsigned default 
0 not null" }, - { "max_rss_node", "mediumint unsigned default 0 not null" }, + { "max_rss_node", "int unsigned default 0 not null" }, { "ave_rss", "float default 0.0 not null" }, - { "max_pages", "mediumint unsigned default 0 not null" }, + { "max_pages", "int unsigned default 0 not null" }, { "max_pages_task", "smallint unsigned default 0 not null" }, - { "max_pages_node", "mediumint unsigned default 0 not null" }, + { "max_pages_node", "int unsigned default 0 not null" }, { "ave_pages", "float default 0.0 not null" }, - { "min_cpu", "mediumint unsigned default 0 not null" }, + { "min_cpu", "int unsigned default 0 not null" }, { "min_cpu_task", "smallint unsigned default 0 not null" }, - { "min_cpu_node", "mediumint unsigned default 0 not null" }, + { "min_cpu_node", "int unsigned default 0 not null" }, { "ave_cpu", "float default 0.0 not null" }, { NULL, NULL} }; storage_field_t suspend_table_fields[] = { { "id", "int not null" }, - { "associd", "mediumint not null" }, + { "associd", "int not null" }, { "start", "int unsigned default 0 not null" }, { "end", "int unsigned default 0 not null" }, { NULL, NULL} diff --git a/src/plugins/jobcomp/mysql/jobcomp_mysql.c b/src/plugins/jobcomp/mysql/jobcomp_mysql.c index df19b7777ff3852449f661b6e08fe055cf9d5357..4bac6e5d62e5a4e83775c6609c90f87ca7c7d2db 100644 --- a/src/plugins/jobcomp/mysql/jobcomp_mysql.c +++ b/src/plugins/jobcomp/mysql/jobcomp_mysql.c @@ -97,12 +97,12 @@ storage_field_t jobcomp_table_fields[] = { { "starttime", "int unsigned default 0 not null" }, { "endtime", "int unsigned default 0 not null" }, { "nodelist", "text" }, - { "nodecnt", "mediumint unsigned not null" }, - { "proc_cnt", "mediumint unsigned not null" }, + { "nodecnt", "int unsigned not null" }, + { "proc_cnt", "int unsigned not null" }, { "connect_type", "tinytext" }, { "reboot", "tinytext" }, { "rotate", "tinytext" }, - { "maxprocs", "mediumint unsigned default 0 not null" }, + { "maxprocs", "int unsigned default 0 not null" }, { "geometry", 
"tinytext" }, { "start", "tinytext" }, { "blockid", "tinytext" }, diff --git a/src/plugins/select/bluegene/block_allocator/block_allocator.c b/src/plugins/select/bluegene/block_allocator/block_allocator.c index 77403e3b55ea3e709f350cc0f2338a21dfd65c78..0808433095a1c4b2a5fdd2a4fbff83eafe0b8003 100644 --- a/src/plugins/select/bluegene/block_allocator/block_allocator.c +++ b/src/plugins/select/bluegene/block_allocator/block_allocator.c @@ -4832,6 +4832,7 @@ static int _find_next_free_using_port_2(ba_switch_t *curr_switch, } list_iterator_destroy(itr); + /* check to see if wire 0 is used with this port */ if(curr_switch-> ext_wire[port_to_try].node_tar[X] == curr_switch->ext_wire[0].node_tar[X] @@ -4994,7 +4995,7 @@ static int _finish_torus(List results, } list_iterator_destroy(itr); - /* check to see if this points to itself */ + /* check to see if wire 0 is used with this port */ if((curr_switch-> ext_wire[ports_to_try[i]].node_tar[X] == curr_switch->ext_wire[0].node_tar[X] && @@ -5041,7 +5042,8 @@ static int _finish_torus(List results, } list_iterator_destroy(itr); if(next_node) { - debug3("Can't finish torus with " + debug3("finishing_torus: " + "Can't finish torus with " "%c%c%c we already were there.", alpha_num[next_node->coord[X]], alpha_num[next_node->coord[Y]], diff --git a/src/plugins/select/bluegene/plugin/bluegene.c b/src/plugins/select/bluegene/plugin/bluegene.c index 7946db4f824f1bd5bc2c4dac4340ec67a718480b..628bc3f34354a6bfe51fb5cf4e7bee74708f3577 100644 --- a/src/plugins/select/bluegene/plugin/bluegene.c +++ b/src/plugins/select/bluegene/plugin/bluegene.c @@ -120,10 +120,6 @@ extern int init_bg(void) list_destroy(bg_conf->ramdisk_list); bg_conf->ramdisk_list = list_create(destroy_image); - bg_conf->smallest_block = 512; - bg_conf->bp_node_cnt = 512; - bg_conf->procs_per_bp = 512; - ba_init(NULL); info("BlueGene plugin loaded successfully"); @@ -1034,7 +1030,17 @@ extern int read_bg_conf(void) bg_conf->quarter_node_cnt = bg_conf->bp_node_cnt/4; } - + /* 
bg_conf->procs_per_bp should have already been set from the + * node_init */ + if(bg_conf->procs_per_bp < bg_conf->bp_node_cnt) { + fatal("For some reason we have only %u procs per bp, but " + "have %u cnodes per bp. You need at least the same " + "number of procs as you have cnodes per bp. " + "Check the NodeName Procs= " + "definition in the slurm.conf.", + bg_conf->procs_per_bp, bg_conf->bp_node_cnt); + } + bg_conf->proc_ratio = bg_conf->procs_per_bp/bg_conf->bp_node_cnt; if(!bg_conf->proc_ratio) fatal("We appear to have less than 1 proc on a cnode. " @@ -1043,7 +1049,8 @@ extern int read_bg_conf(void) "for each node in the slurm.conf", bg_conf->bp_node_cnt, bg_conf->procs_per_bp); num_unused_cpus = - DIM_SIZE[X] * DIM_SIZE[Y] * DIM_SIZE[Z] * bg_conf->procs_per_bp; + DIM_SIZE[X] * DIM_SIZE[Y] * DIM_SIZE[Z] + * bg_conf->procs_per_bp; if (!s_p_get_uint16( &bg_conf->nodecard_node_cnt, "NodeCardNodeCnt", tbl)) { diff --git a/src/sinfo/opts.c b/src/sinfo/opts.c index d2a76fc3d2b7e891ffc05619d84870e637844d40..5bfe7d23dbc85e1335106d4d373acfde379a2066 100644 --- a/src/sinfo/opts.c +++ b/src/sinfo/opts.c @@ -2,7 +2,7 @@ * opts.c - sinfo command line option processing functions ***************************************************************************** * Copyright (C) 2002-2007 The Regents of the University of California. - * Copyright (C) 2008 Lawrence Livermore National Security. + * Copyright (C) 2008-2009 Lawrence Livermore National Security. * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). * Written by Joey Ekstrom <ekstrom1@llnl.gov>, Morris Jette <jette1@llnl.gov> * CODE-OCEC-09-009. All rights reserved. 
@@ -365,16 +365,19 @@ _node_state_list (void) all_states = xstrdup (node_state_string_compact (0)); for (i = 1; i < NODE_STATE_END; i++) { xstrcat (all_states, ","); - xstrcat (all_states, node_state_string_compact(i)); + xstrcat (all_states, node_state_string(i)); } - xstrcat (all_states, ","); - xstrcat (all_states, - node_state_string_compact(NODE_STATE_DRAIN)); - - xstrcat (all_states, ","); - xstrcat (all_states, - node_state_string_compact(NODE_STATE_COMPLETING)); + xstrcat(all_states, ",DRAIN"); + xstrcat(all_states, ","); + xstrcat(all_states, node_state_string(NODE_STATE_COMPLETING)); + xstrcat(all_states, ",NO_RESPOND"); + xstrcat(all_states, ","); + xstrcat(all_states, node_state_string(NODE_STATE_POWER_SAVE)); + xstrcat(all_states, ","); + xstrcat(all_states, node_state_string(NODE_STATE_FAIL)); + xstrcat(all_states, ","); + xstrcat(all_states, node_state_string(NODE_STATE_MAINT)); for (i = 0; i < strlen (all_states); i++) all_states[i] = tolower (all_states[i]); @@ -388,8 +391,8 @@ _node_state_equal (int i, const char *str) { int len = strlen (str); - if ( (strncasecmp (node_state_string_compact(i), str, len) == 0) - || (strncasecmp (node_state_string(i), str, len) == 0)) + if ((strncasecmp(node_state_string_compact(i), str, len) == 0) || + (strncasecmp(node_state_string(i), str, len) == 0)) return (true); return (false); } @@ -404,16 +407,25 @@ static int _node_state_id (char *str) { int i; + int len = strlen (str); + for (i = 0; i < NODE_STATE_END; i++) { if (_node_state_equal (i, str)) return (i); } - if (_node_state_equal (NODE_STATE_DRAIN, str)) + if (strncasecmp("DRAIN", str, len) == 0) return NODE_STATE_DRAIN; - if (_node_state_equal (NODE_STATE_COMPLETING, str)) return NODE_STATE_COMPLETING; + if (strncasecmp("NO_RESPOND", str, len) == 0) + return NODE_STATE_NO_RESPOND; + if (_node_state_equal (NODE_STATE_POWER_SAVE, str)) + return NODE_STATE_POWER_SAVE; + if (_node_state_equal (NODE_STATE_FAIL, str)) + return NODE_STATE_FAIL; + if 
(_node_state_equal (NODE_STATE_MAINT, str)) + return NODE_STATE_MAINT; return (-1); } diff --git a/src/slurmctld/slurmctld.h b/src/slurmctld/slurmctld.h index c3e2acc9505c65a6eb2a09cf1352aac13a93195b..1c06434bb0ea3b65c6fc9142a3ee1d57880bf176 100644 --- a/src/slurmctld/slurmctld.h +++ b/src/slurmctld/slurmctld.h @@ -433,7 +433,7 @@ struct job_details { uint16_t acctg_freq; /* accounting polling interval */ uint16_t argc; /* count of argv elements */ char **argv; /* arguments for a batch job script */ - time_t begin_time; /* start at this time (srun --being), + time_t begin_time; /* start at this time (srun --begin), * resets to time first eligible * (all dependencies satisfied) */ char *ckpt_dir; /* directory to store checkpoint images */