From 1e38eeb4dc1c3ab85ba1baecf559c2db99abaf8d Mon Sep 17 00:00:00 2001 From: Moe Jette <jette1@llnl.gov> Date: Mon, 8 Sep 2008 21:59:42 +0000 Subject: [PATCH] svn merge -r14958:15006 https://eris.llnl.gov/svn/slurm/branches/slurm-1.3 --- NEWS | 10 + configure | 2 +- doc/html/faq.shtml | 9 +- doc/html/team.shtml | 4 +- doc/man/man5/slurm.conf.5 | 16 +- etc/init.d.slurmdbd | 34 +- src/api/allocate.c | 2 +- src/common/assoc_mgr.c | 27 +- src/common/assoc_mgr.h | 9 +- src/common/read_config.c | 12 +- .../filetxt/accounting_storage_filetxt.c | 10 +- .../mysql/accounting_storage_mysql.c | 130 +- .../mysql/mysql_jobacct_process.c | 16 +- .../pgsql/accounting_storage_pgsql.c | 39 +- src/plugins/sched/wiki/job_modify.c | 9 +- src/plugins/sched/wiki2/job_modify.c | 8 +- .../block_allocator/block_allocator.c | 1064 ++++++----------- .../select/bluegene/plugin/bg_job_place.c | 11 +- src/plugins/select/bluegene/plugin/bluegene.c | 1 - .../select/bluegene/plugin/slurm_prolog.c | 5 +- src/sacct/options.c | 18 +- src/sacctmgr/user_functions.c | 19 + src/salloc/Makefile.am | 2 +- src/salloc/Makefile.in | 2 +- src/salloc/salloc.c | 115 ++ src/slurmctld/controller.c | 5 + src/slurmctld/job_mgr.c | 113 +- src/slurmctld/job_scheduler.c | 4 + src/slurmctld/proc_req.c | 13 +- src/slurmctld/slurmctld.h | 12 +- src/slurmctld/step_mgr.c | 2 +- src/slurmd/slurmd/slurmd.c | 6 +- src/slurmd/slurmstepd/mgr.c | 39 +- src/slurmd/slurmstepd/req.c | 14 +- src/slurmdbd/slurmdbd.c | 6 +- src/smap/configure_functions.c | 2 +- src/srun/allocate.c | 2 +- src/srun/srun.c | 12 +- testsuite/expect/globals | 51 + testsuite/expect/test21.10 | 129 +- testsuite/expect/test21.11 | 8 + testsuite/expect/test21.12 | 8 + testsuite/expect/test21.13 | 7 + testsuite/expect/test21.14 | 5 + testsuite/expect/test21.15 | 4 + testsuite/expect/test21.16 | 6 +- testsuite/expect/test21.17 | 5 + testsuite/expect/test21.18 | 5 + testsuite/expect/test21.19 | 5 + testsuite/expect/test21.5 | 5 + testsuite/expect/test21.6 | 71 +- 
testsuite/expect/test21.7 | 10 +- testsuite/expect/test21.8 | 8 + testsuite/expect/test21.9 | 8 + testsuite/expect/test7.3 | 2 +- testsuite/expect/test7.7 | 2 +- testsuite/expect/test7.7.prog.c | 94 +- testsuite/expect/test7.8 | 2 +- 58 files changed, 1300 insertions(+), 939 deletions(-) diff --git a/NEWS b/NEWS index ad68f0d961c..d4c7f49d00f 100644 --- a/NEWS +++ b/NEWS @@ -39,6 +39,13 @@ documents those changes that are of interest to users and admins. are created in SLURM tables for future use without a reboot of the SLURM daemons, but are not reported by any SLURM commands or APIs. +* Changes in SLURM 1.3.9 +======================== + -- Fix jobs being cancelled by ctrl-C to have correct cancelled state in + accounting. + -- Slurmdbd will only cache user data, made for faster start up + -- Improved support for job steps in FRONT_END systems + * Changes in SLURM 1.3.8 ======================== -- Added PrivateData flags for Users, Usage, and Accounts to Accounting. @@ -56,6 +63,7 @@ documents those changes that are of interest to users and admins. -- BLUEGENE - Set MPI environment variables from salloc. -- BLUEGENE - Fix threading issue for overlap mode -- Reject batch scripts containing DOS linebreaks. + -- BLUEGENE - Added wait for block boot to salloc * Changes in SLURM 1.3.7 ======================== @@ -512,6 +520,8 @@ documents those changes that are of interest to users and admins. configuration or other runtime checks. -- Add "include" keywork to SPANK plugstack.conf file to optionally include other configuration files or directories of configuration files. + -- Srun to wait indefinitely for resource allocation to be made. Used to + abort after two minutes. * Changes in SLURM 1.2.34 ========================= diff --git a/configure b/configure index 00ef5f35adf..c052058b64a 100755 --- a/configure +++ b/configure @@ -25271,7 +25271,7 @@ echo "$as_me: WARNING: *** pkg-config not found. 
Cannot probe for libglade-2.0 o # fi -### Check for gtk2.8 package +### Check for gtk2.7.1 package if test "$ac_have_gtk" == "yes" ; then $HAVEPKGCONFIG --exists gtk+-2.0 if ! test $? -eq 0 ; then diff --git a/doc/html/faq.shtml b/doc/html/faq.shtml index b9711d0b1be..6ca32bfc0aa 100644 --- a/doc/html/faq.shtml +++ b/doc/html/faq.shtml @@ -890,7 +890,12 @@ about these options. clocks on the cluster?</b></a><br> In general, yes. Having inconsistent clocks may cause nodes to be unusable. SLURM log files should contain references to -expired credentials. +expired credentials. For example: +<pre> +error: Munge decode failed: Expired credential +ENCODED: Wed May 12 12:34:56 2008 +DECODED: Wed May 12 12:01:12 2008 +</pre> <p><a name="cred_invalid"><b>21. Why are "Invalid job credential" errors generated?</b></a><br> @@ -1001,6 +1006,6 @@ sinfo -t drain -h -o "scontrol update nodename='%N' state=drain reason='%E'" <p class="footer"><a href="#top">top</a></p> -<p style="text-align:center;">Last modified 2 September 2008</p> +<p style="text-align:center;">Last modified 3 September 2008</p> <!--#include virtual="footer.txt"--> diff --git a/doc/html/team.shtml b/doc/html/team.shtml index f82230ee631..df6f0fdfca3 100644 --- a/doc/html/team.shtml +++ b/doc/html/team.shtml @@ -29,6 +29,7 @@ and a host of others. <li>Chuck Clouston (Bull)</li> <li>Chris Dunlap (LLNL)</li> <li>Joey Ekstrom (LLNL/Bringham Young University)</li> +<li>Josh England (TGSMC)</li> <li>Kent Engström (National Supercomputer Centre, Sweden)</li> <li>Jim Garlick (LLNL)</li> <li>Didier Gazen (Laboratoire d'Aerologie, France)</li> @@ -59,11 +60,12 @@ Networking, Italy)</li> <li>Federico Sacerdoti (D.E. 
Shaw)<li> <li>Jeff Squyres (LAM MPI)</li> <li>Prashanth Tamraparni (HP, India)</li> +<li>Adam Todorski (Rensselaer Polytechnic Institute)</li> <li>Kevin Tew (LLNL/Bringham Young University)</li> <li>Jay Windley (Linux NetworX)</li> <li>Anne-Marie Wunderlin (Bull)</li> </ul> -<p style="text-align:center;">Last modified 28 July 2008</p> +<p style="text-align:center;">Last modified 5 September 2008</p> <!--#include virtual="footer.txt"--> diff --git a/doc/man/man5/slurm.conf.5 b/doc/man/man5/slurm.conf.5 index dce6e249340..1c961e2830b 100644 --- a/doc/man/man5/slurm.conf.5 +++ b/doc/man/man5/slurm.conf.5 @@ -1661,11 +1661,19 @@ is very important. \fBSelectTypeParameters\fR should be configured to treat memory as a consumable resource and the \fB\-\-mem\fR option should be used for job allocations. -For more information see the following web page: -\fIhttps://computing.llnl.gov/linux/slurm/cons_res_share.html\fR. -.na Possible values for \fBShared\fR are "EXCLUSIVE", "FORCE", "YES", and "NO". -.ad +Sharing of resources is typically useful only when using +\fBSchedulerType=sched/gang\fR. +For more information see the following web pages: +.br +\fIhttps://computing.llnl.gov/linux/slurm/cons_res.html\fR, +.br +\fIhttps://computing.llnl.gov/linux/slurm/cons_res_share.html\fR, +.br +\fIhttps://computing.llnl.gov/linux/slurm/gang_scheduling.html\fR, and +.br +\fIhttps://computing.llnl.gov/linux/slurm/preempt.html\fR. + .RS .TP 12 \fBEXCLUSIVE\fR diff --git a/etc/init.d.slurmdbd b/etc/init.d.slurmdbd index 55473e4383d..45e4471e8e7 100755 --- a/etc/init.d.slurmdbd +++ b/etc/init.d.slurmdbd @@ -44,6 +44,12 @@ else RETVAL=0 fi +# We can not use a starter program without losing environment +# variables that are critical on Blue Gene systems +if [ -d /bgl/BlueLight/ppcfloor ]; then + STARTPROC="" +fi + # Source slurm specific configuration if [ -f /etc/sysconfig/slurm ] ; then . 
/etc/sysconfig/slurm @@ -53,18 +59,23 @@ fi [ -f $CONFDIR/slurmdbd.conf ] || exit 1 +# setup library paths for slurm and munge support +export LD_LIBRARY_PATH="$LIBDIR:$LD_LIBRARY_PATH" + start() { - echo -n "starting slurmdbd: " + prog=$1 + shift + echo -n "starting $prog: " unset HOME MAIL USER USERNAME - $STARTPROC $SBINDIR/slurmdbd $SLURMDBD_OPTIONS + $STARTPROC $SBINDIR/$prog $SLURMDBD_OPTIONS rc_status -v echo touch /var/lock/subsys/slurmdbd } stop() { - echo -n "stopping slurmdbd: " - killproc slurmdbd -TERM + echo -n "stopping $1: " + killproc $1 -TERM rc_status -v echo rm -f /var/lock/subsys/slurmdbd @@ -76,7 +87,7 @@ slurmstatus() { local rpid local pidfile - pidfile=`grep -i PidFile $CONFDIR/slurmdbd.conf | grep -v '^ *#'` + pidfile=`grep -i ${base}pid $CONFDIR/slurmdbd.conf | grep -v '^ *#'` if [ $? = 0 ]; then pidfile=${pidfile##*=} pidfile=${pidfile%#*} @@ -84,25 +95,26 @@ slurmstatus() { pidfile=/var/run/slurmdbd.pid fi - pid=`pidof -o $$ -o $$PPID -o %PPID -x slurmdbd` + pid=`pidof -o $$ -o $$PPID -o %PPID -x $1 || \ + pidof -o $$ -o $$PPID -o %PPID -x ${base}` if [ -f $pidfile ]; then read rpid < $pidfile if [ "$rpid" != "" -a "$pid" != "" ]; then for i in $pid ; do if [ "$i" = "$rpid" ]; then - echo $"slurmdbd (pid $pid) is running..." + echo $"${base} (pid $pid) is running..." 
return 0 fi done elif [ "$rpid" != "" -a "$pid" = "" ]; then - echo $"slurmdbd is stopped" + echo $"${base} is stopped" return 1 fi fi - echo $"slurmdbd is stopped" + echo $"${base} is stopped" return 3 } @@ -125,8 +137,8 @@ case "$1" in slurmstatus slurmdbd ;; restart) - stop slurmdbd - start slurmdbd + $0 stop + $0 start ;; condrestart) if [ -f /var/lock/subsys/slurm ]; then diff --git a/src/api/allocate.c b/src/api/allocate.c index 41f7a96caa3..d87842f5e2a 100644 --- a/src/api/allocate.c +++ b/src/api/allocate.c @@ -262,7 +262,7 @@ slurm_allocate_resources_blocking (const job_desc_msg_t *user_req, timeout); /* If NULL, we didn't get the allocation in the time desired, so just free the job id */ - if (resp == NULL) { + if (resp == NULL && errno != ESLURM_ALREADY_DONE) { errnum = errno; slurm_complete_job(job_id, -1); } diff --git a/src/common/assoc_mgr.c b/src/common/assoc_mgr.c index 17f09dacf01..3dee13d3113 100644 --- a/src/common/assoc_mgr.c +++ b/src/common/assoc_mgr.c @@ -202,7 +202,7 @@ static int _get_local_user_list(void *db_conn, int enforce) while((user = list_next(itr))) { uid_t pw_uid = uid_from_string(user->name); if(pw_uid == (uid_t) -1) { - error("couldn't get a uid for user %s", + debug("couldn't get a uid for user %s", user->name); user->uid = (uint32_t)NO_VAL; } else @@ -218,26 +218,35 @@ static int _get_local_user_list(void *db_conn, int enforce) extern int assoc_mgr_init(void *db_conn, assoc_init_args_t *args) { - int enforce = 0; + uint16_t enforce = 0; + uint16_t refresh = 0; + uint16_t cache_level = ASSOC_MGR_CACHE_ALL; if(args) { enforce = args->enforce; if(args->remove_assoc_notify) remove_assoc_notify = args->remove_assoc_notify; + refresh = args->refresh; + cache_level = args->cache_level; } - if(!local_cluster_name && !slurmdbd_conf) + if((!local_cluster_name || refresh) && !slurmdbd_conf) { + xfree(local_cluster_name); local_cluster_name = slurm_get_cluster_name(); + } - if(!local_association_list) + if((!local_association_list || 
refresh) + && (cache_level & ASSOC_MGR_CACHE_ASSOC)) if(_get_local_association_list(db_conn, enforce) == SLURM_ERROR) return SLURM_ERROR; - - if(!local_qos_list) + + if((!local_qos_list || refresh) + && (cache_level & ASSOC_MGR_CACHE_QOS)) if(_get_local_qos_list(db_conn, enforce) == SLURM_ERROR) return SLURM_ERROR; - - if(!local_user_list) + + if((!local_user_list || refresh) + && (cache_level & ASSOC_MGR_CACHE_USER)) if(_get_local_user_list(db_conn, enforce) == SLURM_ERROR) return SLURM_ERROR; @@ -709,7 +718,7 @@ extern int assoc_mgr_update_local_users(acct_update_object_t *update) } pw_uid = uid_from_string(object->name); if(pw_uid == (uid_t) -1) { - error("couldn't get a uid for user %s", + debug("couldn't get a uid for user %s", object->name); object->uid = NO_VAL; } else diff --git a/src/common/assoc_mgr.h b/src/common/assoc_mgr.h index df9b26f9b76..2ca1750013b 100644 --- a/src/common/assoc_mgr.h +++ b/src/common/assoc_mgr.h @@ -49,8 +49,15 @@ #include <slurm/slurm.h> #include <slurm/slurm_errno.h> +#define ASSOC_MGR_CACHE_ASSOC 0x0001 +#define ASSOC_MGR_CACHE_QOS 0x0002 +#define ASSOC_MGR_CACHE_USER 0x0004 +#define ASSOC_MGR_CACHE_ALL 0xffff + typedef struct { - int enforce; + uint16_t cache_level; + uint16_t enforce; + uint16_t refresh; void (*remove_assoc_notify) (acct_association_rec_t *rec); } assoc_init_args_t; diff --git a/src/common/read_config.c b/src/common/read_config.c index 58ad7042c37..dbe52700f4e 100644 --- a/src/common/read_config.c +++ b/src/common/read_config.c @@ -810,6 +810,7 @@ static int _register_conf_node_aliases(slurm_conf_node_t *node_ptr) "in FRONT_END mode"); goto cleanup; } + hostname = node_ptr->hostnames; address = node_ptr->addresses; #else @@ -825,15 +826,20 @@ static int _register_conf_node_aliases(slurm_conf_node_t *node_ptr) #endif /* now build the individual node structures */ +#ifdef HAVE_FRONT_END + /* we always want the first on in the list to be the one + * returned when looking for localhost + */ + while ((alias = 
hostlist_pop(alias_list))) { +#else while ((alias = hostlist_shift(alias_list))) { -#ifndef HAVE_FRONT_END hostname = hostlist_shift(hostname_list); address = hostlist_shift(address_list); #endif _push_to_hashtbls(alias, hostname, address, node_ptr->port, - node_ptr->cpus, node_ptr->sockets, - node_ptr->cores, node_ptr->threads); + node_ptr->cpus, node_ptr->sockets, + node_ptr->cores, node_ptr->threads); free(alias); #ifndef HAVE_FRONT_END diff --git a/src/plugins/accounting_storage/filetxt/accounting_storage_filetxt.c b/src/plugins/accounting_storage/filetxt/accounting_storage_filetxt.c index a37ad84ee4d..136d9d694f9 100644 --- a/src/plugins/accounting_storage/filetxt/accounting_storage_filetxt.c +++ b/src/plugins/accounting_storage/filetxt/accounting_storage_filetxt.c @@ -668,6 +668,7 @@ extern int jobacct_storage_p_step_complete(void *db_conn, float ave_vsize = 0, ave_rss = 0, ave_pages = 0; float ave_cpu = 0, ave_cpu2 = 0; char *account; + uint32_t exit_code; if(!storage_init) { debug("jobacct init was not called or it failed"); @@ -684,7 +685,12 @@ extern int jobacct_storage_p_step_complete(void *db_conn, if ((elapsed=now-step_ptr->start_time)<0) elapsed=0; /* For *very* short jobs, if clock is wrong */ - if (step_ptr->exit_code) + + exit_code = step_ptr->exit_code; + if (exit_code == NO_VAL) { + comp_status = JOB_CANCELLED; + exit_code = 0; + } else if (exit_code) comp_status = JOB_FAILED; else comp_status = JOB_COMPLETE; @@ -740,7 +746,7 @@ extern int jobacct_storage_p_step_complete(void *db_conn, JOB_STEP, step_ptr->step_id, /* stepid */ comp_status, /* completion status */ - step_ptr->exit_code, /* completion code */ + exit_code, /* completion code */ cpus, /* number of tasks */ cpus, /* number of cpus */ elapsed, /* elapsed seconds */ diff --git a/src/plugins/accounting_storage/mysql/accounting_storage_mysql.c b/src/plugins/accounting_storage/mysql/accounting_storage_mysql.c index d77c5ea2f37..44817f3cf45 100644 --- 
a/src/plugins/accounting_storage/mysql/accounting_storage_mysql.c +++ b/src/plugins/accounting_storage/mysql/accounting_storage_mysql.c @@ -2011,7 +2011,8 @@ extern int acct_storage_p_add_coord(mysql_conn_t *mysql_conn, uint32_t uid, "insert into %s " "(timestamp, action, name, " "actor, info) " - "values (%d, %u, '%s', '%s', '%s')", + "values (%d, %u, '%s', " + "'%s', \"%s\")", txn_table, now, DBD_ADD_ACCOUNT_COORDS, user, user_name, acct); @@ -6705,10 +6706,11 @@ extern int clusteracct_storage_p_node_down(mysql_conn_t *mysql_conn, xstrfmtcat(query, "insert into %s " "(node_name, cluster, cpu_count, period_start, reason) " - "values ('%s', '%s', %u, %d, '%s') on duplicate key " + "values ('%s', '%s', %u, %d, \"%s\") on duplicate key " "update period_end=0;", event_table, node_ptr->name, cluster, cpus, event_time, my_reason); + debug3("%d(%d) query\n%s", mysql_conn->conn, __LINE__, query); rc = mysql_db_query(mysql_conn->db_conn, query); xfree(query); @@ -6733,6 +6735,7 @@ extern int clusteracct_storage_p_node_up(mysql_conn_t *mysql_conn, "update %s set period_end=%d where cluster='%s' " "and period_end=0 and node_name='%s';", event_table, event_time, cluster, node_ptr->name); + debug3("%d(%d) query\n%s", mysql_conn->conn, __LINE__, query); rc = mysql_db_query(mysql_conn->db_conn, query); xfree(query); return rc; @@ -6964,7 +6967,7 @@ extern int jobacct_storage_p_job_start(mysql_conn_t *mysql_conn, { #ifdef HAVE_MYSQL int rc=SLURM_SUCCESS; - char *jname, *nodes; + char *jname = NULL, *nodes = NULL; long priority; int track_steps = 0; char *block_id = NULL; @@ -7001,7 +7004,7 @@ extern int jobacct_storage_p_job_start(mysql_conn_t *mysql_conn, if (job_ptr->nodes && job_ptr->nodes[0]) nodes = job_ptr->nodes; else - nodes = "(null)"; + nodes = "None assigned"; if(job_ptr->batch_flag) track_steps = 1; @@ -7026,27 +7029,52 @@ extern int jobacct_storage_p_job_start(mysql_conn_t *mysql_conn, if(!job_ptr->db_index) { query = xstrdup_printf( "insert into %s " - "(jobid, 
account, associd, uid, gid, partition, " - "blockid, eligible, submit, start, name, track_steps, " - "state, priority, req_cpus, alloc_cpus, nodelist) " - "values (%u, '%s', %u, %u, %u, '%s', '%s', " - "%d, %d, %d, '%s', %u, " - "%u, %u, %u, %u, '%s') " - "on duplicate key update id=LAST_INSERT_ID(id), " - "end=0, state=%u", - job_table, job_ptr->job_id, job_ptr->account, - job_ptr->assoc_id, - job_ptr->user_id, job_ptr->group_id, - job_ptr->partition, block_id, - (int)job_ptr->details->begin_time, - (int)job_ptr->details->submit_time, - (int)job_ptr->start_time, - jname, track_steps, - job_ptr->job_state & (~JOB_COMPLETING), - priority, job_ptr->num_procs, - job_ptr->total_procs, nodes, - job_ptr->job_state & (~JOB_COMPLETING)); - + "(jobid, associd, uid, gid, nodelist, ", + job_table); + + if(job_ptr->account) + xstrcat(query, "account, "); + if(job_ptr->partition) + xstrcat(query, "partition, "); + if(block_id) + xstrcat(query, "blockid, "); + + xstrfmtcat(query, + "eligible, submit, start, name, track_steps, " + "state, priority, req_cpus, alloc_cpus) " + "values (%u, %u, %u, %u, '%s', ", + job_ptr->job_id, job_ptr->assoc_id, + job_ptr->user_id, job_ptr->group_id, nodes); + + if(job_ptr->account) + xstrfmtcat(query, "'%s', ", job_ptr->account); + if(job_ptr->partition) + xstrfmtcat(query, "'%s', ", job_ptr->partition); + if(block_id) + xstrfmtcat(query, "'%s', ", block_id); + + xstrfmtcat(query, + "%d, %d, %d, '%s', %u, %u, %u, %u, %u) " + "on duplicate key update " + "id=LAST_INSERT_ID(id), state=%u, associd=%u", + (int)job_ptr->details->begin_time, + (int)job_ptr->details->submit_time, + (int)job_ptr->start_time, + jname, track_steps, + job_ptr->job_state & (~JOB_COMPLETING), + priority, job_ptr->num_procs, + job_ptr->total_procs, + job_ptr->job_state & (~JOB_COMPLETING), + job_ptr->assoc_id); + + if(job_ptr->account) + xstrfmtcat(query, ", account='%s'", job_ptr->account); + if(job_ptr->partition) + xstrfmtcat(query, ", partition='%s'", + 
job_ptr->partition); + if(block_id) + xstrfmtcat(query, ", blockid='%s'", block_id); + debug3("%d(%d) query\n%s", mysql_conn->conn, __LINE__, query); try_again: if(!(job_ptr->db_index = mysql_insert_ret_id( @@ -7065,16 +7093,25 @@ extern int jobacct_storage_p_job_start(mysql_conn_t *mysql_conn, rc = SLURM_ERROR; } } else { - query = xstrdup_printf( - "update %s set partition='%s', blockid='%s', start=%d, " - "name='%s', state=%u, alloc_cpus=%u, nodelist='%s', " - "account='%s', end=0 where id=%d", - job_table, job_ptr->partition, block_id, - (int)job_ptr->start_time, - jname, - job_ptr->job_state & (~JOB_COMPLETING), - job_ptr->total_procs, nodes, - job_ptr->account, job_ptr->db_index); + query = xstrdup_printf("update %s set nodelist='%s', ", + job_table, nodes); + + if(job_ptr->account) + xstrfmtcat(query, "account='%s', ", + job_ptr->account); + if(job_ptr->partition) + xstrfmtcat(query, "partition='%s', ", + job_ptr->partition); + if(block_id) + xstrfmtcat(query, "blockid='%s', ", block_id); + + xstrfmtcat(query, "start=%d, name='%s', state=%u, " + "alloc_cpus=%u, associd=%u where id=%d", + (int)job_ptr->start_time, + jname, job_ptr->job_state & (~JOB_COMPLETING), + job_ptr->total_procs, nodes, + job_ptr->assoc_id, + job_ptr->db_index); debug3("%d(%d) query\n%s", mysql_conn->conn, __LINE__, query); rc = mysql_db_query(mysql_conn->db_conn, query); } @@ -7099,7 +7136,7 @@ extern int jobacct_storage_p_job_complete(mysql_conn_t *mysql_conn, #ifdef HAVE_MYSQL char *query = NULL, *nodes = NULL; int rc=SLURM_SUCCESS; - + if (!job_ptr->db_index && (!job_ptr->details || !job_ptr->details->submit_time)) { error("jobacct_storage_p_job_complete: " @@ -7110,15 +7147,19 @@ extern int jobacct_storage_p_job_complete(mysql_conn_t *mysql_conn, if(_check_connection(mysql_conn) != SLURM_SUCCESS) return SLURM_ERROR; debug2("mysql_jobacct_job_complete() called"); + + /* If we get an error with this just fall through to avoid an + * infinite loop + */ if (job_ptr->end_time == 0) { 
debug("mysql_jobacct: job %u never started", job_ptr->job_id); - return SLURM_ERROR; + return SLURM_SUCCESS; } if (job_ptr->nodes && job_ptr->nodes[0]) nodes = job_ptr->nodes; else - nodes = "(null)"; + nodes = "None assigned"; if(!job_ptr->db_index) { if(!(job_ptr->db_index = @@ -7135,7 +7176,6 @@ extern int jobacct_storage_p_job_complete(mysql_conn_t *mysql_conn, job_ptr->job_id); return SLURM_SUCCESS; } - jobacct_storage_p_job_start(mysql_conn, job_ptr); } } @@ -7272,7 +7312,8 @@ extern int jobacct_storage_p_step_complete(mysql_conn_t *mysql_conn, float ave_cpu = 0, ave_cpu2 = 0; char *query = NULL; int rc =SLURM_SUCCESS; - + uint32_t exit_code = 0; + if (!step_ptr->job_ptr->db_index && (!step_ptr->job_ptr->details || !step_ptr->job_ptr->details->submit_time)) { @@ -7309,7 +7350,12 @@ extern int jobacct_storage_p_step_complete(mysql_conn_t *mysql_conn, if ((elapsed=now-step_ptr->start_time)<0) elapsed=0; /* For *very* short jobs, if clock is wrong */ - if (step_ptr->exit_code) + + exit_code = step_ptr->exit_code; + if (exit_code == NO_VAL) { + comp_status = JOB_CANCELLED; + exit_code = 0; + } else if (exit_code) comp_status = JOB_FAILED; else comp_status = JOB_COMPLETE; @@ -7369,7 +7415,7 @@ extern int jobacct_storage_p_step_complete(mysql_conn_t *mysql_conn, step_table, (int)now, comp_status, step_ptr->job_ptr->requid, - step_ptr->exit_code, + exit_code, /* user seconds */ jobacct->user_cpu_sec, /* user microseconds */ diff --git a/src/plugins/accounting_storage/mysql/mysql_jobacct_process.c b/src/plugins/accounting_storage/mysql/mysql_jobacct_process.c index 318daf6dc28..ace4c6f4cd0 100644 --- a/src/plugins/accounting_storage/mysql/mysql_jobacct_process.c +++ b/src/plugins/accounting_storage/mysql/mysql_jobacct_process.c @@ -500,6 +500,7 @@ no_cond: if(row[JOB_REQ_ACCOUNT]) job->account = xstrdup(row[JOB_REQ_ACCOUNT]); + if(row[JOB_REQ_BLOCKID]) job->blockid = xstrdup(row[JOB_REQ_BLOCKID]); @@ -569,11 +570,15 @@ no_cond: } } else { job->suspended = 
atoi(row[JOB_REQ_SUSPENDED]); - if(!job->end) { + + if(!job->start) { + job->elapsed = 0; + } else if(!job->end) { job->elapsed = now - job->start; } else { job->elapsed = job->end - job->start; } + job->elapsed -= job->suspended; } @@ -581,8 +586,13 @@ no_cond: job->jobname = xstrdup(row[JOB_REQ_NAME]); job->gid = atoi(row[JOB_REQ_GID]); job->exitcode = atoi(row[JOB_REQ_COMP_CODE]); - job->partition = xstrdup(row[JOB_REQ_PARTITION]); - job->nodes = xstrdup(row[JOB_REQ_NODELIST]); + + if(row[JOB_REQ_PARTITION]) + job->partition = xstrdup(row[JOB_REQ_PARTITION]); + + if(row[JOB_REQ_NODELIST]) + job->nodes = xstrdup(row[JOB_REQ_NODELIST]); + if (!job->nodes || !strcmp(job->nodes, "(null)")) { xfree(job->nodes); job->nodes = xstrdup("(unknown)"); diff --git a/src/plugins/accounting_storage/pgsql/accounting_storage_pgsql.c b/src/plugins/accounting_storage/pgsql/accounting_storage_pgsql.c index 26c6e05350d..41d4a280848 100644 --- a/src/plugins/accounting_storage/pgsql/accounting_storage_pgsql.c +++ b/src/plugins/accounting_storage/pgsql/accounting_storage_pgsql.c @@ -1143,7 +1143,7 @@ extern int jobacct_storage_p_job_start(PGconn *acct_pgsql_db, if (job_ptr->nodes && job_ptr->nodes[0]) nodes = job_ptr->nodes; else - nodes = "(null)"; + nodes = "None assigned"; if(job_ptr->batch_flag) track_steps = 1; @@ -1250,15 +1250,24 @@ extern int jobacct_storage_p_job_complete(PGconn *acct_pgsql_db, if (job_ptr->nodes && job_ptr->nodes[0]) nodes = job_ptr->nodes; else - nodes = "(null)"; - + nodes = "None assigned"; + if(!job_ptr->db_index) { - job_ptr->db_index = _get_db_index(acct_pgsql_db, - job_ptr->details->submit_time, - job_ptr->job_id, - job_ptr->assoc_id); - if(job_ptr->db_index == -1) - return SLURM_ERROR; + if(!(job_ptr->db_index = + _get_db_index(acct_pgsql_db, + job_ptr->details->submit_time, + job_ptr->job_id, + job_ptr->assoc_id))) { + /* If we get an error with this just fall + * through to avoid an infinite loop + */ + if(jobacct_storage_p_job_start(acct_pgsql_db, 
job_ptr) + == SLURM_ERROR) { + error("couldn't add job %u at job completion", + job_ptr->job_id); + return SLURM_SUCCESS; + } + } } query = xstrdup_printf("update %s set start=%u, endtime=%u, state=%d, " "nodelist='%s', comp_code=%u, " @@ -1385,7 +1394,8 @@ extern int jobacct_storage_p_step_complete(PGconn *acct_pgsql_db, float ave_cpu = 0, ave_cpu2 = 0; char *query = NULL; int rc =SLURM_SUCCESS; - + uint32_t exit_code; + if (!step_ptr->job_ptr->db_index && (!step_ptr->job_ptr->details || !step_ptr->job_ptr->details->submit_time)) { @@ -1425,7 +1435,12 @@ extern int jobacct_storage_p_step_complete(PGconn *acct_pgsql_db, if ((elapsed=now-step_ptr->start_time)<0) elapsed=0; /* For *very* short jobs, if clock is wrong */ - if (step_ptr->exit_code) + + exit_code = step_ptr->exit_code; + if (exit_code == NO_VAL) { + comp_status = JOB_CANCELLED; + exit_code = 0; + } else if (exit_code) comp_status = JOB_FAILED; else comp_status = JOB_COMPLETE; @@ -1475,7 +1490,7 @@ extern int jobacct_storage_p_step_complete(PGconn *acct_pgsql_db, step_table, (int)now, comp_status, step_ptr->job_ptr->requid, - step_ptr->exit_code, + exit_code, /* user seconds */ jobacct->user_cpu_sec, /* user microseconds */ diff --git a/src/plugins/sched/wiki/job_modify.c b/src/plugins/sched/wiki/job_modify.c index 4ac123c9ca7..7cad50eadc1 100644 --- a/src/plugins/sched/wiki/job_modify.c +++ b/src/plugins/sched/wiki/job_modify.c @@ -95,11 +95,10 @@ static int _job_modify(uint32_t jobid, char *bank_ptr, old_time) * 60); last_job_update = time(NULL); } - if (bank_ptr) { - info("wiki: change job %u bank %s", jobid, bank_ptr); - xfree(job_ptr->account); - job_ptr->account = xstrdup(bank_ptr); - last_job_update = time(NULL); + + if (bank_ptr && + (update_job_account("wiki", job_ptr, bank_ptr) != SLURM_SUCCESS)) { + return EINVAL; } if (new_hostlist) { diff --git a/src/plugins/sched/wiki2/job_modify.c b/src/plugins/sched/wiki2/job_modify.c index 9e5f7aec178..714a38976b9 100644 --- 
a/src/plugins/sched/wiki2/job_modify.c +++ b/src/plugins/sched/wiki2/job_modify.c @@ -99,11 +99,9 @@ static int _job_modify(uint32_t jobid, char *bank_ptr, last_job_update = now; } - if (bank_ptr) { - info("wiki: change job %u bank %s", jobid, bank_ptr); - xfree(job_ptr->account); - job_ptr->account = xstrdup(bank_ptr); - last_job_update = now; + if (bank_ptr && + (update_job_account("wiki", job_ptr, bank_ptr) != SLURM_SUCCESS)) { + return EINVAL; } if (feature_ptr) { diff --git a/src/plugins/select/bluegene/block_allocator/block_allocator.c b/src/plugins/select/bluegene/block_allocator/block_allocator.c index 6001c2ac07f..c55a91280fa 100644 --- a/src/plugins/select/bluegene/block_allocator/block_allocator.c +++ b/src/plugins/select/bluegene/block_allocator/block_allocator.c @@ -96,6 +96,11 @@ s_p_options_t bg_conf_file_options[] = { {NULL} }; +typedef enum { + BLOCK_ALGO_FIRST, + BLOCK_ALGO_SECOND +} block_algo_t; + #ifdef HAVE_BG /** internal helper functions */ #ifdef HAVE_BG_FILES @@ -153,7 +158,7 @@ static void _delete_path_list(void *object); static int _find_match(ba_request_t* ba_request, List results); /** */ -static bool _node_used(ba_node_t* ba_node, int *geometry); +static bool _node_used(ba_node_t* ba_node, int x_size); /* */ static void _switch_config(ba_node_t* source, ba_node_t* target, int dim, @@ -167,14 +172,9 @@ static int _set_external_wires(int dim, int count, ba_node_t* source, static char *_set_internal_wires(List nodes, int size, int conn_type); /* */ -static int _find_x_path(List results, ba_node_t *ba_node, - int *start, int *first, - int *geometry, int found, int conn_type); - -/* */ -static int _find_x_path2(List results, ba_node_t *ba_node, - int *start, int *first, - int *geometry, int found, int conn_type); +static int _find_x_path(List results, ba_node_t *ba_node, int *start, + int x_size, int found, int conn_type, + block_algo_t algo); /* */ static int _remove_node(List results, int *node_tar); @@ -190,7 +190,7 @@ static int 
_find_next_free_using_port_2(ba_switch_t *curr_switch, /* int count, int highest_phys_x); */ /* */ static int _finish_torus(ba_switch_t *curr_switch, int source_port, - List nodes, int dim, int count, int *start); + int dim, int count, int *start); /* */ static int *_set_best_path(); @@ -1432,6 +1432,7 @@ end_it: * IN start - where to start the allocation. * IN geometry - the requested geometry of the block. * IN conn_type - mesh, torus, or small. + * * RET char * - hostlist of midplanes results represent must be * xfreed. NULL on failure */ @@ -1450,24 +1451,22 @@ extern char *set_bg_block(List results, int *start, || start[Y]>=DIM_SIZE[Y] || start[Z]>=DIM_SIZE[Z]) return NULL; - if(geometry[X]<=0 - || geometry[Y]<=0 - || geometry[Z]<=0) { + + if(geometry[X] <= 0 || geometry[Y] <= 0 || geometry[Z] <= 0) { error("problem with geometry %c%c%c, needs to be at least 111", alpha_num[geometry[X]], alpha_num[geometry[Y]], alpha_num[geometry[Z]]); return NULL; } + size = geometry[X] * geometry[Y] * geometry[Z]; - ba_node = &ba_system_ptr-> - grid[start[X]][start[Y]][start[Z]]; + ba_node = &ba_system_ptr->grid[start[X]][start[Y]][start[Z]]; #else if(start[X]>=DIM_SIZE[X]) return NULL; size = geometry[X]; - ba_node = &ba_system_ptr-> - grid[start[X]]; + ba_node = &ba_system_ptr->grid[start[X]]; #endif @@ -1478,36 +1477,34 @@ extern char *set_bg_block(List results, int *start, results = list_create(NULL); else send_results = 1; - + /* This midplane should have already been checked if it was in + use or not */ list_append(results, ba_node); if(conn_type == SELECT_SMALL) { /* adding the ba_node and ending */ ba_node->used = true; - name = xmalloc(4); - snprintf(name, 4, "%c%c%c", - alpha_num[ba_node->coord[X]], - alpha_num[ba_node->coord[Y]], - alpha_num[ba_node->coord[Z]]); + name = xstrdup_printf("%c%c%c", + alpha_num[ba_node->coord[X]], + alpha_num[ba_node->coord[Y]], + alpha_num[ba_node->coord[Z]]); goto end_it; } found = _find_x_path(results, ba_node, ba_node->coord, - 
ba_node->coord, - geometry, + geometry[X], 1, - conn_type); + conn_type, BLOCK_ALGO_FIRST); if(!found) { debug2("trying less efficient code"); remove_block(results, color_count); list_delete_all(results, &empty_null_destroy_list, ""); list_append(results, ba_node); - found = _find_x_path2(results, ba_node, - ba_node->coord, - ba_node->coord, - geometry, - 1, - conn_type); + found = _find_x_path(results, ba_node, + ba_node->coord, + geometry[X], + 1, + conn_type, BLOCK_ALGO_SECOND); } if(found) { #ifdef HAVE_BG @@ -2709,10 +2706,25 @@ static int _append_geo(int *geometry, List geos, int rotate) } /* + * Fill in the paths and extra midplanes we need for the block. + * Basically copy the x path sent in with the start_list in each Y and + * Z dimension filling in every midplane for the block and then + * completing the Y and Z wiring, tying the whole block together. * + * IN/OUT results - total list of midplanes after this function + * returns successfully. Should be + * an exact copy of the start_list at first. + * IN start_list - exact copy of results at first. This should only be + * a list of midplanes on the X dim. We will work off this and + * the geometry to fill in this wiring for the X dim in all the + * Y and Z coords. 
+ * IN geometry - What the block looks like + * IN conn_type - Mesh or Torus + * + * RET: 0 on failure 1 on success */ static int _fill_in_coords(List results, List start_list, - int *geometry, int conn_type) + int *geometry, int conn_type) { ba_node_t *ba_node = NULL; ba_node_t *check_node = NULL; @@ -2722,8 +2734,9 @@ static int _fill_in_coords(List results, List start_list, ba_switch_t *curr_switch = NULL; ba_switch_t *next_switch = NULL; - if(!start_list) + if(!start_list || !results) return 0; + /* go through the start_list and add all the midplanes */ itr = list_iterator_create(start_list); while((check_node = (ba_node_t*) list_next(itr))) { curr_switch = &check_node->axis_switch[X]; @@ -2744,18 +2757,23 @@ static int _fill_in_coords(List results, List start_list, [check_node->coord[X]] [check_node->coord[Y]+y] [check_node->coord[Z]+z]; - if(ba_node->coord[Y] - == check_node->coord[Y] - && ba_node->coord[Z] - == check_node->coord[Z]) + + if(ba_node->coord[Y] == check_node->coord[Y] + && ba_node->coord[Z] == check_node->coord[Z]) continue; - if (!_node_used(ba_node,geometry)) { + + if (!_node_used(ba_node, geometry[X])) { debug3("here Adding %c%c%c", alpha_num[ba_node->coord[X]], alpha_num[ba_node->coord[Y]], alpha_num[ba_node->coord[Z]]); list_append(results, ba_node); next_switch = &ba_node->axis_switch[X]; + + /* since we are going off the + * main system we can send NULL + * here + */ _copy_the_path(NULL, curr_switch, next_switch, 0, X); @@ -2788,6 +2806,24 @@ failed: return rc; } +/* + * Copy a path through the wiring of a switch to another switch on a + * starting port on a dimension. + * + * IN/OUT: nodes - Local list of midplanes you are keeping track of. If + * you visit any new midplanes a copy from ba_system_grid + * will be added to the list. If NULL the path will be + * set in mark_switch of the main virtual system (ba_system_grid). 
+ * IN: curr_switch - The switch you want to copy the path of + * IN/OUT: mark_switch - The switch you want to fill in. On success + * this switch will contain a complete path from the curr_switch + * starting from the source port. + * IN: source - source port number (If calling for the first time + * should be 0 since we are looking for 1 at the end) + * IN: dim - Dimension XYZ + * + * RET: on success 1, on error 0 + */ static int _copy_the_path(List nodes, ba_switch_t *curr_switch, ba_switch_t *mark_switch, int source, int dim) @@ -2798,7 +2834,8 @@ static int _copy_the_path(List nodes, ba_switch_t *curr_switch, int port_tar, port_tar1; ba_switch_t *next_switch = NULL; ba_switch_t *next_mark_switch = NULL; - /*set the switch to not be used */ + + /* Copy the source used and port_tar */ mark_switch->int_wire[source].used = curr_switch->int_wire[source].used; mark_switch->int_wire[source].port_tar = @@ -2806,6 +2843,7 @@ static int _copy_the_path(List nodes, ba_switch_t *curr_switch, port_tar = curr_switch->int_wire[source].port_tar; + /* Now to the same thing from the other end */ mark_switch->int_wire[port_tar].used = curr_switch->int_wire[port_tar].used; mark_switch->int_wire[port_tar].port_tar = @@ -2828,6 +2866,7 @@ static int _copy_the_path(List nodes, ba_switch_t *curr_switch, port_tar); if(port_tar == 1) { + /* found the end of the line */ mark_switch->int_wire[1].used = curr_switch->int_wire[1].used; mark_switch->int_wire[1].port_tar = @@ -2841,12 +2880,18 @@ static int _copy_the_path(List nodes, ba_switch_t *curr_switch, if(node_curr[X] == node_tar[X] && node_curr[Y] == node_tar[Y] && node_curr[Z] == node_tar[Z]) { + /* We are going to the same node! 
this should never + happen */ debug4("something bad happened!!"); return 0; } + + /* see what the next switch is going to be */ next_switch = &ba_system_ptr-> grid[node_tar[X]][node_tar[Y]][node_tar[Z]].axis_switch[dim]; if(!nodes) { + /* If no nodes then just get the next switch to fill + in from the main system */ next_mark_switch = &ba_system_ptr-> grid[mark_node_tar[X]] [mark_node_tar[Y]] @@ -2855,6 +2900,7 @@ static int _copy_the_path(List nodes, ba_switch_t *curr_switch, } else { ba_node_t *ba_node = NULL; ListIterator itr = list_iterator_create(nodes); + /* see if we have already been to this node */ while((ba_node = list_next(itr))) { if (ba_node->coord[X] == mark_node_tar[X] && ba_node->coord[Y] == mark_node_tar[Y] && @@ -2863,6 +2909,7 @@ static int _copy_the_path(List nodes, ba_switch_t *curr_switch, } list_iterator_destroy(itr); if(!ba_node) { + /* If node grab a copy and add it to the list */ ba_node = ba_copy_node(&ba_system_ptr-> grid[mark_node_tar[X]] [mark_node_tar[Y]] @@ -2877,8 +2924,10 @@ static int _copy_the_path(List nodes, ba_switch_t *curr_switch, next_mark_switch = &ba_node->axis_switch[dim]; } + + /* Keep going until we reach the end of the line */ return _copy_the_path(nodes, next_switch, next_mark_switch, - port_tar, dim); + port_tar, dim); } static int _find_yz_path(ba_node_t *ba_node, int *first, @@ -3350,7 +3399,7 @@ start_again: #endif ; - if (!_node_used(ba_node, ba_request->geometry)) { + if (!_node_used(ba_node, ba_request->geometry[X])) { debug3("trying this node %c%c%c %c%c%c %d", alpha_num[start[X]], alpha_num[start[Y]], @@ -3419,10 +3468,15 @@ requested_end: return 0; } -/* bool _node_used(ba_node_t* ba_node, int geometry, */ -static bool _node_used(ba_node_t* ba_node, int *geometry) +/* + * Used to check if midplane is usable in the block we are creating + * + * IN: ba_node - node to check if is used + * IN: x_size - How big is the block in the X dim used to see if the + * wires are full hence making this midplane unusable. 
+ */ +static bool _node_used(ba_node_t* ba_node, int x_size) { - int i=0; ba_switch_t* ba_switch = NULL; /* if we've used this node in another block already */ @@ -3433,17 +3487,27 @@ static bool _node_used(ba_node_t* ba_node, int *geometry) alpha_num[ba_node->coord[Z]]); return true; } - /* if we've used this nodes switches completely in another - block already */ - for(i=0;i<1;i++) { - if(geometry[i]>1) { - ba_switch = &ba_node->axis_switch[i]; - - if(ba_switch->int_wire[3].used - && ba_switch->int_wire[5].used) { - debug3("switch in use dim %d!",i); - return true; - } + /* Check If we've used this node's switches completely in another + block already. Right now we are only needing to look at + the X dim since it is the only one with extra wires. This + can be set up to do all the dim's if in the future if it is + needed. We only need to check this if we are planning on + using more than 1 midplane in the block creation */ + if(x_size > 1) { + /* get the switch of the X Dimension */ + ba_switch = &ba_node->axis_switch[X]; + + /* If both of these ports are used then the node + is in use since there are no more wires we + can use since these can not connect to each + other they must be connected to the other ports. 
+ */ + if(ba_switch->int_wire[3].used && ba_switch->int_wire[5].used) { + debug3("switch full in the X dim on node %c%c%c!", + alpha_num[ba_node->coord[X]], + alpha_num[ba_node->coord[Y]], + alpha_num[ba_node->coord[Z]]); + return true; } } @@ -3844,7 +3908,7 @@ static char *_set_internal_wires(List nodes, int size, int conn_type) name = xmalloc(BUFSIZE); hostlist = hostlist_create(NULL); itr = list_iterator_create(nodes); - while((ba_node[count] = (ba_node_t*) list_next(itr))) { + while((ba_node[count] = list_next(itr))) { snprintf(temp_name, sizeof(temp_name), "%c%c%c", alpha_num[ba_node[count]->coord[X]], alpha_num[ba_node[count]->coord[Y]], @@ -3894,41 +3958,67 @@ static char *_set_internal_wires(List nodes, int size, int conn_type) return name; } +/* + * Used to find a complete path based on the conn_type for an x dim. + * When starting to wire a block together this should be called first. + * + * IN/OUT: results - contains the number of midplanes we are + * potentially going to use in the X dim. + * IN: ba_node - current node we are looking at and have already added + * to results. 
+ * IN: start - coordinates of the first midplane (so we know when when + * to end with a torus) + * IN: x_size - How many midplanes are we looking for in the X dim + * IN: found - count of how many midplanes we have found in the x dim + * IN: conn_type - MESH or TORUS + * IN: algo - algorythm to try an allocation by + * + * RET: 0 on failure, 1 on success + */ static int _find_x_path(List results, ba_node_t *ba_node, - int *start, int *first, int *geometry, - int found, int conn_type) + int *start, int x_size, + int found, int conn_type, block_algo_t algo) { ba_switch_t *curr_switch = NULL; ba_switch_t *next_switch = NULL; int port_tar = 0; int source_port=0; - int target_port=0; + int target_port=1; int broke = 0, not_first = 0; - int ports_to_try[2] = {3,5}; + int ports_to_try[2] = {4, 2}; int *node_tar = NULL; int i = 0; ba_node_t *next_node = NULL; ba_node_t *check_node = NULL; -/* int highest_phys_x = geometry[X] - start[X]; */ +/* int highest_phys_x = x_size - start[X]; */ /* info("highest_phys_x is %d", highest_phys_x); */ - ListIterator itr; + ListIterator itr = NULL; - if(!ba_node) + if(!ba_node || !results || !start) return 0; - if(!source_port) { - target_port=1; + /* we don't need to go any further */ + if(x_size == 1) + return 1; + + if(algo == BLOCK_ALGO_FIRST) { ports_to_try[0] = 4; ports_to_try[1] = 2; - - } + } else if(algo == BLOCK_ALGO_SECOND) { + ports_to_try[0] = 2; + ports_to_try[1] = 4; + } else { + error("Unknown algo %d", algo); + return 0; + } + curr_switch = &ba_node->axis_switch[X]; - if(geometry[X] == 1) { - goto found_one; - } - debug3("found - %d",found); + + debug3("Algo(%d) found - %d", algo, found); + + /* Check the 2 ports we can leave though in ports_to_try */ for(i=0;i<2;i++) { /* info("trying port %d", ports_to_try[i]); */ /* check to make sure it isn't used */ @@ -3950,53 +4040,48 @@ static int _find_x_path(List results, ba_node_t *ba_node, /* port_tar); */ /* check to see if we are back at the start of the block */ - 
if((node_tar[X] == - start[X] && - node_tar[Y] == - start[Y] && - node_tar[Z] == - start[Z])) { + if((node_tar[X] == start[X] + && node_tar[Y] == start[Y] + && node_tar[Z] == start[Z])) { broke = 1; goto broke_it; } /* check to see if the port points to itself */ - if((node_tar[X] == - ba_node->coord[X] && - node_tar[Y] == - ba_node->coord[Y] && - node_tar[Z] == - ba_node->coord[Z])) { + if((node_tar[X] == ba_node->coord[X] + && node_tar[Y] == ba_node->coord[Y] + && node_tar[Z] == ba_node->coord[Z])) { continue; } /* check to see if I am going to a place I have already been before */ itr = list_iterator_create(results); - while((next_node = (ba_node_t*) list_next(itr))) { - debug3("looking at %c%c%c and %c%c%c", + while((next_node = list_next(itr))) { + debug3("Algo(%d) looking at %c%c%c and %c%c%c", + algo, alpha_num[next_node->coord[X]], alpha_num[next_node->coord[Y]], alpha_num[next_node->coord[Z]], alpha_num[node_tar[X]], alpha_num[node_tar[Y]], alpha_num[node_tar[Z]]); - if((node_tar[X] == next_node->coord[X] && - node_tar[Y] == next_node->coord[Y] && - node_tar[Z] == next_node->coord[Z])) { + if((node_tar[X] == next_node->coord[X] + && node_tar[Y] == next_node->coord[Y] + && node_tar[Z] == next_node->coord[Z])) { not_first = 1; break; } } list_iterator_destroy(itr); - if(not_first && found<DIM_SIZE[X]) { - debug2("already been there before"); + if(not_first && found < DIM_SIZE[X]) { + debug2("Algo(%d) already been there before", + algo); not_first = 0; continue; } not_first = 0; broke_it: - next_node = &ba_system_ptr-> - grid[node_tar[X]] + next_node = &ba_system_ptr->grid[node_tar[X]] #ifdef HAVE_BG [node_tar[Y]] [node_tar[Z]] @@ -4004,97 +4089,36 @@ static int _find_x_path(List results, ba_node_t *ba_node, ; next_switch = &next_node->axis_switch[X]; - if((conn_type == SELECT_MESH) - && (found == (geometry[X]))) { - debug2("we found the end of the mesh"); + if((conn_type == SELECT_MESH) && (found == (x_size))) { + debug2("Algo(%d) we found the end of the 
mesh", + algo); return 1; } - debug3("Broke = %d Found = %d geometry[X] = %d", - broke, found, geometry[X]); - -/* This doesnt' appear to be of any use since we are doing a circular - * system not a linear one. Kept just to make sure. - */ + debug3("Algo(%d) Broke = %d Found = %d x_size = %d", + algo, broke, found, x_size); -/* debug3("Next Phys X %d Highest X %d", */ -/* next_node->phys_x, highest_phys_x); */ -/* if(next_node->phys_x >= highest_phys_x) { */ -/* debug3("looking for a passthrough"); */ -/* if(best_path) */ -/* list_destroy(best_path); */ -/* best_path = list_create(_delete_path_list); */ -/* if(path) */ -/* list_destroy(path); */ -/* path = list_create(_delete_path_list); */ - -/* _find_passthrough(curr_switch, */ -/* 0, */ -/* results, */ -/* X, */ -/* 0, */ -/* highest_phys_x); */ -/* if(best_count < BEST_COUNT_INIT) { */ -/* debug2("yes found next free %d", */ -/* best_count); */ -/* node_tar = _set_best_path(); */ -/* next_node = &ba_system_ptr-> */ -/* grid[node_tar[X]] */ -/* #ifdef HAVE_BG */ -/* [node_tar[Y]] */ -/* [node_tar[Z]] */ -/* #endif */ -/* ; */ -/* next_switch = */ -/* &next_node->axis_switch[X]; */ - -/* #ifdef HAVE_BG */ -/* debug2("found %d looking at " */ -/* "%c%c%c going to %c%c%c %d", */ -/* found, */ -/* alpha_num[ba_node->coord[X]], */ -/* alpha_num[ba_node->coord[Y]], */ -/* alpha_num[ba_node->coord[Z]], */ -/* alpha_num[node_tar[X]], */ -/* alpha_num[node_tar[Y]], */ -/* alpha_num[node_tar[Z]], */ -/* port_tar); */ -/* #endif */ -/* list_append(results, next_node); */ -/* found++; */ -/* if(_find_x_path(results, next_node, */ -/* start, first, geometry, */ -/* found, conn_type)) { */ -/* return 1; */ -/* } else { */ -/* found--; */ -/* _reset_the_path(curr_switch, 0, */ -/* 1, X); */ -/* _remove_node(results, */ -/* next_node->coord); */ -/* return 0; */ -/* } */ -/* } */ -/* } */ - - if(broke && (found == geometry[X])) { + if(broke && (found == x_size)) { goto found_path; - } else if(found == geometry[X]) { - 
debug2("finishing the torus!"); + } else if(found == x_size) { + debug2("Algo(%d) finishing the torus!", algo); + if(best_path) - list_destroy(best_path); - best_path = list_create(_delete_path_list); + list_flush(best_path); + else + best_path = + list_create(_delete_path_list); + if(path) - list_destroy(path); - path = list_create(_delete_path_list); - _finish_torus(curr_switch, - 0, - results, - X, - 0, - start); + list_flush(path); + else + path = list_create(_delete_path_list); + + _finish_torus(curr_switch, 0, X, 0, start); + if(best_count < BEST_COUNT_INIT) { - debug2("Found a best path with %d " - "steps.", best_count); + debug2("Algo(%d) Found a best path " + "with %d steps.", + algo, best_count); _set_best_path(); return 1; } else { @@ -4105,10 +4129,11 @@ static int _find_x_path(List results, ba_node_t *ba_node, continue; } - if (!_node_used(next_node, geometry)) { + if (!_node_used(next_node, x_size)) { #ifdef HAVE_BG - debug2("found %d looking at %c%c%c " + debug2("Algo(%d) found %d looking at %c%c%c " "%d going to %c%c%c %d", + algo, found, alpha_num[ba_node->coord[X]], alpha_num[ba_node->coord[Y]], @@ -4120,13 +4145,11 @@ static int _find_x_path(List results, ba_node_t *ba_node, port_tar); #endif itr = list_iterator_create(results); - while((check_node = - (ba_node_t*) list_next(itr))) { - if((node_tar[X] == - check_node->coord[X] && - node_tar[Y] == - check_node->coord[Y] && - node_tar[Z] == + while((check_node = list_next(itr))) { + if((node_tar[X] == check_node->coord[X] + && node_tar[Y] == + check_node->coord[Y] + && node_tar[Z] == check_node->coord[Z])) { break; } @@ -4134,7 +4157,8 @@ static int _find_x_path(List results, ba_node_t *ba_node, list_iterator_destroy(itr); if(!check_node) { #ifdef HAVE_BG - debug2("add %c%c%c", + debug2("Algo(%d) add %c%c%c", + algo, alpha_num[next_node->coord[X]], alpha_num[next_node->coord[Y]], alpha_num[next_node->coord[Z]]); @@ -4142,8 +4166,9 @@ static int _find_x_path(List results, ba_node_t *ba_node, 
list_append(results, next_node); } else { #ifdef HAVE_BG - debug2("Hey this is already added " - "%c%c%c", + debug2("Algo(%d) Hey this is already " + "added %c%c%c", + algo, alpha_num[node_tar[X]], alpha_num[node_tar[Y]], alpha_num[node_tar[Z]]); @@ -4151,19 +4176,20 @@ static int _find_x_path(List results, ba_node_t *ba_node, continue; } found++; - + + /* look for the next closest midplane */ if(!_find_x_path(results, next_node, - start, first, geometry, - found, conn_type)) { - _remove_node(results, - next_node->coord); + start, x_size, + found, conn_type, algo)) { + _remove_node(results, next_node->coord); found--; continue; } else { found_path: #ifdef HAVE_BG - debug2("added node %c%c%c %d %d -> " - "%c%c%c %d %d", + debug2("Algo(%d) added node %c%c%c " + "%d %d -> %c%c%c %d %d", + algo, alpha_num[ba_node->coord[X]], alpha_num[ba_node->coord[Y]], alpha_num[ba_node->coord[Z]], @@ -4175,341 +4201,104 @@ static int _find_x_path(List results, ba_node_t *ba_node, port_tar, target_port); #endif - found_one: - if(geometry[X] != 1) { - curr_switch-> - int_wire - [source_port].used = 1; - curr_switch-> - int_wire - [source_port].port_tar - = ports_to_try[i]; - curr_switch-> - int_wire - [ports_to_try[i]].used - = 1; - curr_switch-> - int_wire - [ports_to_try[i]]. 
- port_tar = source_port; + curr_switch->int_wire[source_port].used + = 1; + curr_switch->int_wire + [source_port].port_tar + = ports_to_try[i]; + curr_switch->int_wire + [ports_to_try[i]].used = 1; + curr_switch->int_wire + [ports_to_try[i]].port_tar + = source_port; - next_switch-> - int_wire[port_tar].used - = 1; - next_switch-> - int_wire - [port_tar].port_tar - = target_port; - next_switch-> - int_wire - [target_port].used = 1; - next_switch-> - int_wire - [target_port].port_tar - = port_tar; - } + next_switch->int_wire[port_tar].used + = 1; + next_switch->int_wire[port_tar].port_tar + = target_port; + next_switch->int_wire[target_port].used + = 1; + next_switch->int_wire + [target_port].port_tar + = port_tar; return 1; - } } } } - debug2("couldn't find path"); - return 0; -} - -static int _find_x_path2(List results, ba_node_t *ba_node, - int *start, int *first, int *geometry, - int found, int conn_type) -{ - ba_switch_t *curr_switch = NULL; - ba_switch_t *next_switch = NULL; - - int port_tar = 0; - int source_port=0; - int target_port=0; - int broke = 0, not_first = 0; - int ports_to_try[2] = {3,5}; - int *node_tar = NULL; - int i = 0; - ba_node_t *next_node = NULL; - ba_node_t *check_node = NULL; - - ListIterator itr; - - if(!ba_node) + if(algo == BLOCK_ALGO_FIRST) { + debug2("Algo(%d) couldn't find path", algo); return 0; - - if(!source_port) { - target_port=1; - ports_to_try[0] = 2; - ports_to_try[1] = 4; + } else if(algo == BLOCK_ALGO_SECOND) { +#ifdef HAVE_BG + debug2("Algo(%d) looking for the next free node " + "starting at %c%c%c", + algo, + alpha_num[ba_node->coord[X]], + alpha_num[ba_node->coord[Y]], + alpha_num[ba_node->coord[Z]]); +#endif + + if(best_path) + list_flush(best_path); + else + best_path = list_create(_delete_path_list); + + if(path) + list_flush(path); + else + path = list_create(_delete_path_list); + + _find_next_free_using_port_2(curr_switch, 0, results, X, 0); + + if(best_count < BEST_COUNT_INIT) { + debug2("Algo(%d) yes found next 
free %d", algo, + best_count); + node_tar = _set_best_path(); - } - curr_switch = &ba_node->axis_switch[X]; - if(geometry[X] == 1) { - goto found_one; - } - debug2("found - %d",found); - for(i=0;i<2;i++) { - /* check to make sure it isn't used */ - if(!curr_switch->int_wire[ports_to_try[i]].used) { - node_tar = curr_switch-> - ext_wire[ports_to_try[i]].node_tar; - port_tar = curr_switch-> - ext_wire[ports_to_try[i]].port_tar; - if((node_tar[X] == - start[X] && - node_tar[Y] == - start[Y] && - node_tar[Z] == - start[Z])) { - broke = 1; - goto broke_it; - } - if((node_tar[X] == - ba_node->coord[X] && - node_tar[Y] == - ba_node->coord[Y] && - node_tar[Z] == - ba_node->coord[Z])) { - continue; - } - itr = list_iterator_create(results); - while((next_node = (ba_node_t*) list_next(itr))) { - if((node_tar[X] == - next_node->coord[X] && - node_tar[Y] == - next_node->coord[Y] && - node_tar[Z] == - next_node->coord[Z])) { - not_first = 1; - break; - } - - } - list_iterator_destroy(itr); - if(not_first && found<DIM_SIZE[X]) { - not_first = 0; - continue; - } - not_first = 0; - - broke_it: - next_node = &ba_system_ptr-> - grid[node_tar[X]] + next_node = &ba_system_ptr->grid[node_tar[X]] #ifdef HAVE_BG [node_tar[Y]] [node_tar[Z]] #endif ; - + next_switch = &next_node->axis_switch[X]; - - if((conn_type == SELECT_MESH) - && (found == (geometry[X]))) { - debug2("we found the end of the mesh"); +#ifdef HAVE_BG + debug2("Algo(%d) found %d looking at %c%c%c " + "going to %c%c%c %d", + algo, found, + alpha_num[ba_node->coord[X]], + alpha_num[ba_node->coord[Y]], + alpha_num[ba_node->coord[Z]], + alpha_num[node_tar[X]], + alpha_num[node_tar[Y]], + alpha_num[node_tar[Z]], + port_tar); +#endif + list_append(results, next_node); + found++; + if(_find_x_path(results, next_node, + start, x_size, found, + conn_type, algo)) { return 1; + } else { + found--; + _reset_the_path(curr_switch, 0, 1, X); + _remove_node(results, next_node->coord); + debug2("Algo(%d) couldn't finish " + "the path off 
this one", algo); } - debug3("Broke = %d Found = %d geometry[X] = %d", - broke, found, geometry[X]); - if(broke && (found == geometry[X])) { - goto found_path; - } else if(found == geometry[X]) { - debug2("finishing the torus!"); - if(best_path) - list_destroy(best_path); - best_path = list_create(_delete_path_list); - if(path) - list_destroy(path); - path = list_create(_delete_path_list); - _finish_torus(curr_switch, - 0, - results, - X, - 0, - start); - if(best_count < BEST_COUNT_INIT) { - debug2("Found a best path with %d " - "steps.", best_count); - _set_best_path(); - return 1; - } else { - return 0; - } - } else if(broke) { - broke = 0; - continue; - } - - if (!_node_used(next_node, geometry)) { -#ifdef HAVE_BG - debug2("found %d looking at %c%c%c " - "%d going to %c%c%c %d", - found, - alpha_num[ba_node->coord[X]], - alpha_num[ba_node->coord[Y]], - alpha_num[ba_node->coord[Z]], - ports_to_try[i], - alpha_num[node_tar[X]], - alpha_num[node_tar[Y]], - alpha_num[node_tar[Z]], - port_tar); -#endif - itr = list_iterator_create(results); - while((check_node = - (ba_node_t*) list_next(itr))) { - if((node_tar[X] == - check_node->coord[X] && - node_tar[Y] == - check_node->coord[Y] && - node_tar[Z] == - check_node->coord[Z])) { - break; - } - } - list_iterator_destroy(itr); - if(!check_node) { -#ifdef HAVE_BG - debug2("add %c%c%c", - alpha_num[next_node->coord[X]], - alpha_num[next_node->coord[Y]], - alpha_num[next_node->coord[Z]]); -#endif - list_append(results, next_node); - } else { -#ifdef HAVE_BG - debug2("Hey this is already added " - "%c%c%c", - alpha_num[node_tar[X]], - alpha_num[node_tar[Y]], - alpha_num[node_tar[Z]]); -#endif - continue; - } - found++; - - if(!_find_x_path2(results, next_node, - start, first, geometry, - found, conn_type)) { - _remove_node(results, - next_node->coord); - found--; - continue; - } else { - found_path: -#ifdef HAVE_BG - debug2("added node %c%c%c %d %d -> " - "%c%c%c %d %d", - alpha_num[ba_node->coord[X]], - 
alpha_num[ba_node->coord[Y]], - alpha_num[ba_node->coord[Z]], - source_port, - ports_to_try[i], - alpha_num[node_tar[X]], - alpha_num[node_tar[Y]], - alpha_num[node_tar[Z]], - port_tar, - target_port); -#endif - found_one: - if(geometry[X] != 1) { - curr_switch-> - int_wire - [source_port].used = 1; - curr_switch-> - int_wire - [source_port].port_tar - = ports_to_try[i]; - curr_switch-> - int_wire - [ports_to_try[i]].used - = 1; - curr_switch-> - int_wire - [ports_to_try[i]]. - port_tar = source_port; - - next_switch-> - int_wire[port_tar].used - = 1; - next_switch-> - int_wire - [port_tar].port_tar - = target_port; - next_switch-> - int_wire - [target_port].used = 1; - next_switch-> - int_wire - [target_port].port_tar - = port_tar; - } - return 1; - } - } - } + } + + debug2("Algo(%d) couldn't find path", algo); + return 0; } -#ifdef HAVE_BG - debug2("looking for the next free node starting at %c%c%c", - alpha_num[ba_node->coord[X]], - alpha_num[ba_node->coord[Y]], - alpha_num[ba_node->coord[Z]]); -#endif - if(best_path) - list_destroy(best_path); - best_path = list_create(_delete_path_list); - if(path) - list_destroy(path); - path = list_create(_delete_path_list); - - _find_next_free_using_port_2(curr_switch, - 0, - results, - X, - 0); - if(best_count < BEST_COUNT_INIT) { - debug2("yes found next free %d", best_count); - node_tar = _set_best_path(); - - next_node = &ba_system_ptr-> - grid[node_tar[X]] -#ifdef HAVE_BG - [node_tar[Y]] - [node_tar[Z]] -#endif - ; - - next_switch = &next_node->axis_switch[X]; - -#ifdef HAVE_BG - debug2("found %d looking at %c%c%c going to %c%c%c %d", - found, - alpha_num[ba_node->coord[X]], - alpha_num[ba_node->coord[Y]], - alpha_num[ba_node->coord[Z]], - alpha_num[node_tar[X]], - alpha_num[node_tar[Y]], - alpha_num[node_tar[Z]], - port_tar); -#endif - list_append(results, next_node); - found++; - if(_find_x_path2(results, next_node, - start, first, geometry, found, conn_type)) { - return 1; - } else { - found--; - 
_reset_the_path(curr_switch, 0, 1, X); - _remove_node(results, next_node->coord); - debug2("couldn't finish the path off this one"); - } - } - - debug2("couldn't find path 2"); + error("We got here meaning there is a bad algo, " + "but this should never happen algo(%d)", algo); return 0; } @@ -4614,8 +4403,8 @@ static int _find_next_free_using_port_2(ba_switch_t *curr_switch, if((source_port%2)) target_port=1; - list_destroy(best_path); - best_path = list_create(_delete_path_list); + list_flush(best_path); + found = true; path_add->out = target_port; list_push(path, path_add); @@ -4634,7 +4423,7 @@ static int _find_next_free_using_port_2(ba_switch_t *curr_switch, temp_switch->dim = path_switch->dim; temp_switch->in = path_switch->in; temp_switch->out = path_switch->out; - list_append(best_path,temp_switch); + list_append(best_path, temp_switch); } list_iterator_destroy(itr); best_count = count; @@ -4711,205 +4500,28 @@ return_0: return 0; } -/* static int _find_passthrough(ba_switch_t *curr_switch, int source_port, */ -/* List nodes, int dim, int count, int highest_phys_x) */ -/* { */ -/* ba_switch_t *next_switch = NULL; */ -/* ba_path_switch_t *path_add = */ -/* (ba_path_switch_t *) xmalloc(sizeof(ba_path_switch_t)); */ -/* ba_path_switch_t *path_switch = NULL; */ -/* ba_path_switch_t *temp_switch = NULL; */ -/* int port_tar; */ -/* int target_port = 0; */ -/* int ports_to_try[2] = {3,5}; */ -/* int *node_tar= curr_switch->ext_wire[0].node_tar; */ -/* int *node_src = curr_switch->ext_wire[0].node_tar; */ -/* int i; */ -/* int used=0; */ -/* int broke = 0; */ -/* ba_node_t *ba_node = NULL; */ - -/* ListIterator itr; */ -/* static bool found = false; */ - -/* path_add->geometry[X] = node_src[X]; */ -/* #ifdef HAVE_BG */ -/* path_add->geometry[Y] = node_src[Y]; */ -/* path_add->geometry[Z] = node_src[Z]; */ -/* #endif */ -/* path_add->dim = dim; */ -/* path_add->in = source_port; */ - -/* if(count>=best_count) { */ -/* xfree(path_add); */ -/* return 0; */ -/* } */ - 
-/* itr = list_iterator_create(nodes); */ -/* while((ba_node = (ba_node_t*) list_next(itr))) { */ - -/* #ifdef HAVE_BG */ -/* if(node_tar[X] == ba_node->coord[X] */ -/* && node_tar[Y] == ba_node->coord[Y] */ -/* && node_tar[Z] == ba_node->coord[Z]) { */ -/* broke = 1; */ -/* break; */ -/* } */ -/* #else */ -/* if(node_tar[X] == ba_node->coord[X]) { */ -/* broke = 1; */ -/* break; */ -/* } */ -/* #endif */ - -/* } */ -/* list_iterator_destroy(itr); */ -/* ba_node = &ba_system_ptr-> */ -/* grid[node_tar[X]] */ -/* #ifdef HAVE_BG */ -/* [node_tar[Y]] */ -/* [node_tar[Z]] */ -/* #endif */ -/* ; */ -/* if(!broke && count>0 */ -/* && !ba_node->used */ -/* && (ba_node->phys_x < highest_phys_x)) { */ - -/* debug3("this one not found %c%c%c", */ -/* alpha_num[node_tar[X]], */ -/* alpha_num[node_tar[Y]], */ -/* alpha_num[node_tar[Z]]); */ - -/* broke = 0; */ - -/* if((source_port%2)) */ -/* target_port=1; */ - -/* list_destroy(best_path); */ -/* best_path = list_create(_delete_path_list); */ -/* found = true; */ -/* path_add->out = target_port; */ -/* list_push(path, path_add); */ - -/* itr = list_iterator_create(path); */ -/* while((path_switch = (ba_path_switch_t*) list_next(itr))){ */ - -/* temp_switch = (ba_path_switch_t *) */ -/* xmalloc(sizeof(ba_path_switch_t)); */ - -/* temp_switch->geometry[X] = path_switch->geometry[X]; */ -/* #ifdef HAVE_BG */ -/* temp_switch->geometry[Y] = path_switch->geometry[Y]; */ -/* temp_switch->geometry[Z] = path_switch->geometry[Z]; */ -/* #endif */ -/* temp_switch->dim = path_switch->dim; */ -/* temp_switch->in = path_switch->in; */ -/* temp_switch->out = path_switch->out; */ -/* list_append(best_path,temp_switch); */ -/* } */ -/* list_iterator_destroy(itr); */ -/* best_count = count; */ -/* return 1; */ -/* } */ - -/* if(source_port==0 || source_port==3 || source_port==5) { */ -/* if(count==0) { */ -/* ports_to_try[0] = 2; */ -/* ports_to_try[1] = 4; */ -/* } else { */ -/* ports_to_try[0] = 4; */ -/* ports_to_try[1] = 2; */ -/* } */ -/* 
} */ - -/* for(i=0;i<2;i++) { */ -/* used=0; */ -/* if(!curr_switch->int_wire[ports_to_try[i]].used) { */ -/* itr = list_iterator_create(path); */ -/* while((path_switch = */ -/* (ba_path_switch_t*) list_next(itr))){ */ - -/* if(((path_switch->geometry[X] == node_src[X]) */ -/* #ifdef HAVE_BG */ -/* && (path_switch->geometry[Y] */ -/* == node_src[Y]) */ -/* && (path_switch->geometry[Z] */ -/* == node_tar[Z]) */ -/* #endif */ -/* )) { */ - -/* if( path_switch->out */ -/* == ports_to_try[i]) { */ -/* used = 1; */ -/* break; */ -/* } */ -/* } */ -/* } */ -/* list_iterator_destroy(itr); */ - -/* if(curr_switch-> */ -/* ext_wire[ports_to_try[i]].node_tar[X] */ -/* == curr_switch->ext_wire[0].node_tar[X] */ -/* #ifdef HAVE_BG */ -/* && curr_switch-> */ -/* ext_wire[ports_to_try[i]].node_tar[Y] */ -/* == curr_switch->ext_wire[0].node_tar[Y] */ -/* && curr_switch-> */ -/* ext_wire[ports_to_try[i]].node_tar[Z] */ -/* == curr_switch->ext_wire[0].node_tar[Z] */ -/* #endif */ -/* ) { */ -/* continue; */ -/* } */ - -/* if(!used) { */ -/* port_tar = curr_switch-> */ -/* ext_wire[ports_to_try[i]].port_tar; */ -/* node_tar = curr_switch-> */ -/* ext_wire[ports_to_try[i]].node_tar; */ - -/* next_switch = &ba_system_ptr-> */ -/* grid[node_tar[X]] */ -/* #ifdef HAVE_BG */ -/* [node_tar[Y]] */ -/* [node_tar[Z]] */ -/* #endif */ -/* .axis_switch[X]; */ - -/* count++; */ -/* path_add->out = ports_to_try[i]; */ -/* list_push(path, path_add); */ -/* debug3("looking at this one " */ -/* "%c%c%c %d -> %c%c%c %d", */ -/* alpha_num[ba_node->coord[X]], */ -/* alpha_num[ba_node->coord[Y]], */ -/* alpha_num[ba_node->coord[Z]], */ -/* ports_to_try[i], */ -/* alpha_num[node_tar[X]], */ -/* alpha_num[node_tar[Y]], */ -/* alpha_num[node_tar[Z]], */ -/* port_tar); */ - -/* _find_passthrough(next_switch, port_tar, nodes, */ -/* dim, count, highest_phys_x); */ -/* while((temp_switch = list_pop(path)) */ -/* != path_add){ */ -/* xfree(temp_switch); */ -/* debug3("something here 2"); */ -/* } */ -/* } */ 
-/* } */ -/* } */ -/* xfree(path_add); */ -/* return 0; */ -/* } */ +/* + * Used to tie the end of the block to the start. best_path and path + * should both be set up before calling this function. + * + * IN: curr_switch - + * IN: source_port - + * IN: dim - + * IN: count - + * IN: start - + * + * RET: 0 on failure, 1 on success + * + * Sets up global variable best_path, and best_count. On success + * best_count will be >= BEST_COUNT_INIT you can call _set_best_path + * to apply this path to the main system (ba_system_ptr) + */ static int _finish_torus(ba_switch_t *curr_switch, int source_port, - List nodes, int dim, int count, int *start) + int dim, int count, int *start) { ba_switch_t *next_switch = NULL; - ba_path_switch_t *path_add = - (ba_path_switch_t *) xmalloc(sizeof(ba_path_switch_t)); + ba_path_switch_t *path_add = xmalloc(sizeof(ba_path_switch_t)); ba_path_switch_t *path_switch = NULL; ba_path_switch_t *temp_switch = NULL; int port_tar; @@ -4945,18 +4557,16 @@ static int _finish_torus(ba_switch_t *curr_switch, int source_port, target_port=1; if(!curr_switch->int_wire[target_port].used) { - list_destroy(best_path); - best_path = list_create(_delete_path_list); + list_flush(best_path); + found = true; path_add->out = target_port; list_push(path, path_add); itr = list_iterator_create(path); - while((path_switch = - (ba_path_switch_t*) list_next(itr))){ + while((path_switch = list_next(itr))) { - temp_switch = (ba_path_switch_t *) - xmalloc(sizeof(ba_path_switch_t)); + temp_switch = xmalloc(sizeof(ba_path_switch_t)); temp_switch->geometry[X] = path_switch->geometry[X]; @@ -4986,8 +4596,7 @@ static int _finish_torus(ba_switch_t *curr_switch, int source_port, used=0; if(!curr_switch->int_wire[ports_to_try[i]].used) { itr = list_iterator_create(path); - while((path_switch = - (ba_path_switch_t*) list_next(itr))){ + while((path_switch = list_next(itr))){ if(((path_switch->geometry[X] == node_src[X]) #ifdef HAVE_BG @@ -5022,8 +4631,7 @@ static int 
_finish_torus(ba_switch_t *curr_switch, int source_port, node_tar = curr_switch-> ext_wire[ports_to_try[i]].node_tar; - next_switch = &ba_system_ptr-> - grid[node_tar[X]] + next_switch = &ba_system_ptr->grid[node_tar[X]] #ifdef HAVE_BG [node_tar[Y]] [node_tar[Z]] @@ -5034,8 +4642,8 @@ static int _finish_torus(ba_switch_t *curr_switch, int source_port, count++; path_add->out = ports_to_try[i]; list_push(path, path_add); - _finish_torus(next_switch, port_tar, nodes, - dim, count, start); + _finish_torus(next_switch, port_tar, + dim, count, start); while((temp_switch = list_pop(path)) != path_add){ xfree(temp_switch); @@ -5048,14 +4656,22 @@ static int _finish_torus(ba_switch_t *curr_switch, int source_port, return 0; } +/* + * using best_path set up previously from _finish_torus or + * _find_next_free_using_port_2. Will set up the path contained there + * into the main virtual system. With will also set the passthrough + * flag if there was a passthrough used. + */ static int *_set_best_path() { ListIterator itr; ba_path_switch_t *path_switch = NULL; ba_switch_t *curr_switch = NULL; int *geo = NULL; + if(!best_path) return NULL; + itr = list_iterator_create(best_path); while((path_switch = (ba_path_switch_t*) list_next(itr))) { if(passthrough && path_switch->in > 1 && path_switch->out > 1) { @@ -5070,15 +4686,13 @@ static int *_set_best_path() path_switch->in, path_switch->out); if(!geo) geo = path_switch->geometry; - curr_switch = &ba_system_ptr-> - grid + curr_switch = &ba_system_ptr->grid [path_switch->geometry[X]] [path_switch->geometry[Y]] [path_switch->geometry[Z]]. axis_switch[path_switch->dim]; #else - curr_switch = &ba_system_ptr-> - grid[path_switch->geometry[X]]. + curr_switch = &ba_system_ptr->grid[path_switch->geometry[X]]. 
axis_switch[path_switch->dim]; #endif diff --git a/src/plugins/select/bluegene/plugin/bg_job_place.c b/src/plugins/select/bluegene/plugin/bg_job_place.c index a7435af07c4..f62594eede8 100644 --- a/src/plugins/select/bluegene/plugin/bg_job_place.c +++ b/src/plugins/select/bluegene/plugin/bg_job_place.c @@ -380,7 +380,7 @@ static bg_record_t *_find_matching_block(List block_list, test_only); itr = list_iterator_create(block_list); - while ((bg_record = (bg_record_t*) list_next(itr))) { + while ((bg_record = list_next(itr))) { /* If test_only we want to fall through to tell the scheduler that it is runnable just not right now. */ @@ -636,6 +636,8 @@ static int _check_for_booted_overlapping_blocks( * bg_record */ list_remove(bg_record_itr); + slurm_mutex_lock(&block_state_mutex); + if(bg_record->original) { debug3("This was a copy"); found_record = @@ -651,8 +653,10 @@ static int _check_for_booted_overlapping_blocks( } destroy_bg_record(bg_record); if(!found_record) { - error("1 this record wasn't " - "found in the list!"); + debug2("This record wasn't " + "found in the bg_list, " + "no big deal, it " + "probably wasn't added"); //rc = SLURM_ERROR; } else { List temp_list = @@ -663,6 +667,7 @@ static int _check_for_booted_overlapping_blocks( free_block_list(temp_list); list_destroy(temp_list); } + slurm_mutex_unlock(&block_state_mutex); } rc = 1; diff --git a/src/plugins/select/bluegene/plugin/bluegene.c b/src/plugins/select/bluegene/plugin/bluegene.c index d3ff7a7749d..8c2b9410510 100644 --- a/src/plugins/select/bluegene/plugin/bluegene.c +++ b/src/plugins/select/bluegene/plugin/bluegene.c @@ -488,7 +488,6 @@ extern bg_record_t *find_and_remove_org_from_bg_list(List my_list, if(bit_equal(bg_record->bitmap, found_record->bitmap) && bit_equal(bg_record->ionode_bitmap, found_record->ionode_bitmap)) { - if(!strcmp(bg_record->bg_block_id, found_record->bg_block_id)) { list_remove(itr); diff --git a/src/plugins/select/bluegene/plugin/slurm_prolog.c 
b/src/plugins/select/bluegene/plugin/slurm_prolog.c index af652b7879c..f4ad1d0201d 100644 --- a/src/plugins/select/bluegene/plugin/slurm_prolog.c +++ b/src/plugins/select/bluegene/plugin/slurm_prolog.c @@ -176,10 +176,11 @@ static int _get_job_size(uint32_t job_id) } /* - * Test if any BG blocks are in deallocating state + * Test if any BG blocks are in deallocating state. Since they are + * probably related to this job, we will want to sleep longer * RET 1: deallocate in progress * 0: no deallocate in progress - * -1: error occurred + * -1: error occurred */ static int _partitions_dealloc() { diff --git a/src/sacct/options.c b/src/sacct/options.c index b7c2737fc28..da6921117a1 100644 --- a/src/sacct/options.c +++ b/src/sacct/options.c @@ -414,7 +414,7 @@ void _help_msg(void) "-c, --completion\n" " Use job completion instead of accounting data.\n" "-C, --cluster\n" - " Only send data about this cluster.\n" + " Only send data about this cluster -1 for all clusters.\n" "-d, --dump\n" " Dump the raw data records\n" "--duplicates\n" @@ -622,7 +622,7 @@ void parse_command_line(int argc, char **argv) char *dot = NULL; bool brief_output = FALSE, long_output = FALSE; bool all_users = 0; - + bool all_clusters = 1; static struct option long_options[] = { {"all", 0,0, 'a'}, {"accounts", 1, 0, 'A'}, @@ -690,6 +690,10 @@ void parse_command_line(int argc, char **argv) params.opt_completion = 1; break; case 'C': + if(!strcasecmp(optarg, "-1")) { + all_clusters = 1; + break; + } if(!params.opt_cluster_list) params.opt_cluster_list = list_create(slurm_destroy_char); @@ -934,7 +938,15 @@ void parse_command_line(int argc, char **argv) } /* specific clusters requested?
*/ - if (params.opt_verbose && params.opt_cluster_list + if(all_clusters) { + if(params.opt_cluster_list + && list_count(params.opt_cluster_list)) { + list_destroy(params.opt_cluster_list); + params.opt_cluster_list = NULL; + } + if(params.opt_verbose) + fprintf(stderr, "Clusters requested:\n\t: all\n"); + } else if (params.opt_verbose && params.opt_cluster_list && list_count(params.opt_cluster_list)) { fprintf(stderr, "Clusters requested:\n"); itr = list_iterator_create(params.opt_cluster_list); diff --git a/src/sacctmgr/user_functions.c b/src/sacctmgr/user_functions.c index d87cbb8f55d..66d4289610c 100644 --- a/src/sacctmgr/user_functions.c +++ b/src/sacctmgr/user_functions.c @@ -37,6 +37,7 @@ \*****************************************************************************/ #include "src/sacctmgr/sacctmgr.h" +#include "src/common/uid.h" static int _set_cond(int *start, int argc, char *argv[], acct_user_cond_t *user_cond, @@ -504,6 +505,7 @@ extern int sacctmgr_add_user(int argc, char *argv[]) while((name = list_next(itr))) { user = NULL; if(!sacctmgr_find_user_from_list(local_user_list, name)) { + uid_t pw_uid; if(!default_acct) { exit_code=1; fprintf(stderr, " Need a default account for " @@ -524,6 +526,22 @@ extern int sacctmgr_add_user(int argc, char *argv[]) } first = 0; } + pw_uid = uid_from_string(name); + if(pw_uid == (uid_t) -1) { + char *warning = xstrdup_printf( + "There is no uid for user '%s'" + "\nAre you sure you want to continue?", + name); + + if(!commit_check(warning)) { + xfree(warning); + rc = SLURM_ERROR; + list_flush(user_list); + goto no_default; + } + xfree(warning); + } + user = xmalloc(sizeof(acct_user_rec_t)); user->assoc_list = list_create(NULL); user->name = xstrdup(name); @@ -543,6 +561,7 @@ extern int sacctmgr_add_user(int argc, char *argv[]) } user->admin_level = admin_level; + xstrfmtcat(user_str, " %s\n", name); list_append(user_list, user); diff --git a/src/salloc/Makefile.am b/src/salloc/Makefile.am index e2da3019f5c..05fb0f3eb54 
100644 --- a/src/salloc/Makefile.am +++ b/src/salloc/Makefile.am @@ -3,7 +3,7 @@ AUTOMAKE_OPTIONS = foreign CLEANFILES = core.* -INCLUDES = -I$(top_srcdir) +INCLUDES = -I$(top_srcdir) $(BG_INCLUDES) bin_PROGRAMS = salloc diff --git a/src/salloc/Makefile.in b/src/salloc/Makefile.in index 91ae04a4039..03315a66ab0 100644 --- a/src/salloc/Makefile.in +++ b/src/salloc/Makefile.in @@ -263,7 +263,7 @@ top_builddir = @top_builddir@ top_srcdir = @top_srcdir@ AUTOMAKE_OPTIONS = foreign CLEANFILES = core.* -INCLUDES = -I$(top_srcdir) +INCLUDES = -I$(top_srcdir) $(BG_INCLUDES) salloc_SOURCES = salloc.c salloc.h opt.c opt.h convenience_libs = $(top_builddir)/src/api/libslurm.o -ldl salloc_LDADD = \ diff --git a/src/salloc/salloc.c b/src/salloc/salloc.c index de913220ffd..c1a48258353 100644 --- a/src/salloc/salloc.c +++ b/src/salloc/salloc.c @@ -50,6 +50,14 @@ #include "src/salloc/salloc.h" #include "src/salloc/opt.h" +#ifdef HAVE_BG +#include "src/api/job_info.h" +#include "src/api/node_select_info.h" +#include "src/common/node_select.h" +#include "src/plugins/select/bluegene/plugin/bg_boot_time.h" +#include "src/plugins/select/bluegene/wrap_rm_api.h" +#endif + #define MAX_RETRIES 3 char **command_argv; @@ -76,6 +84,16 @@ static void _user_msg_handler(srun_user_msg_t *msg); static void _ping_handler(srun_ping_msg_t *msg); static void _node_fail_handler(srun_node_fail_msg_t *msg); +#ifdef HAVE_BG + +#define POLL_SLEEP 3 /* retry interval in seconds */ + +static int _wait_bluegene_block_ready( + resource_allocation_response_msg_t *alloc); +static int _blocks_dealloc(); +#endif + + int main(int argc, char *argv[]) { log_options_t logopt = LOG_OPTS_STDERR_ONLY; @@ -187,6 +205,13 @@ int main(int argc, char *argv[]) * Allocation granted! 
*/ info("Granted job allocation %d", alloc->job_id); +#ifdef HAVE_BG + if (!_wait_bluegene_block_ready(alloc)) { + error("Something is wrong with the boot of the block."); + goto relinquish; + } + +#endif if (opt.bell == BELL_ALWAYS || (opt.bell == BELL_AFTER_DELAY && ((after - before) > DEFAULT_BELL_DELAY))) { @@ -520,3 +545,93 @@ static void _node_fail_handler(srun_node_fail_msg_t *msg) { error("Node failure on %s", msg->nodelist); } + +#ifdef HAVE_BG +/* returns 1 if job and nodes are ready for job to begin, 0 otherwise */ +static int _wait_bluegene_block_ready(resource_allocation_response_msg_t *alloc) +{ + int is_ready = 0, i, rc; + char *block_id = NULL; + int cur_delay = 0; + int max_delay = BG_FREE_PREVIOUS_BLOCK + BG_MIN_BLOCK_BOOT + + (BG_INCR_BLOCK_BOOT * alloc->node_cnt); + + select_g_get_jobinfo(alloc->select_jobinfo, SELECT_DATA_BLOCK_ID, + &block_id); + + for (i=0; (cur_delay < max_delay); i++) { + if(i == 1) + info("Waiting for block %s to become ready for job", + block_id); + if (i) { + sleep(POLL_SLEEP); + rc = _blocks_dealloc(); + if ((rc == 0) || (rc == -1)) + cur_delay += POLL_SLEEP; + debug("still waiting"); + } + + rc = slurm_job_node_ready(alloc->job_id); + + if (rc == READY_JOB_FATAL) + break; /* fatal error */ + if (rc == READY_JOB_ERROR) /* error */ + continue; /* retry */ + if ((rc & READY_JOB_STATE) == 0) /* job killed */ + break; + if (rc & READY_NODE_STATE) { /* job and node ready */ + is_ready = 1; + break; + } + } + + if (is_ready) + info("Block %s is ready for job", block_id); + else + error("Block %s still not ready", block_id); + xfree(block_id); + + return is_ready; +} + +/* + * Test if any BG blocks are in deallocating state since they are + * probably related to this job we will want to sleep longer + * RET 1: deallocate in progress + * 0: no deallocate in progress + * -1: error occurred + */ +static int _blocks_dealloc() +{ + static node_select_info_msg_t *bg_info_ptr = NULL, *new_bg_ptr = NULL; + int rc = 0, error_code = 0, 
i; + + if (bg_info_ptr) { + error_code = slurm_load_node_select(bg_info_ptr->last_update, + &new_bg_ptr); + if (error_code == SLURM_SUCCESS) + select_g_free_node_info(&bg_info_ptr); + else if (slurm_get_errno() == SLURM_NO_CHANGE_IN_DATA) { + error_code = SLURM_SUCCESS; + new_bg_ptr = bg_info_ptr; + } + } else { + error_code = slurm_load_node_select((time_t) NULL, &new_bg_ptr); + } + + if (error_code) { + error("slurm_load_partitions: %s\n", + slurm_strerror(slurm_get_errno())); + return -1; + } + for (i=0; i<new_bg_ptr->record_count; i++) { + if(new_bg_ptr->bg_info_array[i].state + == RM_PARTITION_DEALLOCATING) { + rc = 1; + break; + } + } + bg_info_ptr = new_bg_ptr; + return rc; +} +#endif diff --git a/src/slurmctld/controller.c b/src/slurmctld/controller.c index 165d92c96f2..ff9a8313348 100644 --- a/src/slurmctld/controller.c +++ b/src/slurmctld/controller.c @@ -304,8 +304,13 @@ int main(int argc, char *argv[]) slurmctld_cluster_name = xstrdup(slurmctld_conf.cluster_name); accounting_enforce = slurmctld_conf.accounting_storage_enforce; acct_db_conn = acct_storage_g_get_connection(true, false); + + memset(&assoc_init_arg, 0, sizeof(assoc_init_args_t)); assoc_init_arg.enforce = accounting_enforce; assoc_init_arg.remove_assoc_notify = _remove_assoc; + assoc_init_arg.refresh = 0; + assoc_init_arg.cache_level = ASSOC_MGR_CACHE_ALL; + if (assoc_mgr_init(acct_db_conn, &assoc_init_arg) && accounting_enforce) { error("assoc_mgr_init failure"); diff --git a/src/slurmctld/job_mgr.c b/src/slurmctld/job_mgr.c index b45b3d9e2e9..8dd391c7a75 100644 --- a/src/slurmctld/job_mgr.c +++ b/src/slurmctld/job_mgr.c @@ -1878,6 +1878,7 @@ extern int job_complete(uint32_t job_id, uid_t uid, bool requeue, xfree(job_ptr->state_desc); } else job_ptr->job_state = JOB_COMPLETE | job_comp_flag; + if (suspended) { job_ptr->end_time = job_ptr->suspend_time; job_ptr->tot_sus_time += @@ -4320,55 +4321,24 @@ int update_job(job_desc_msg_t * job_specs, uid_t uid) } if (job_specs->account) { - if 
((!IS_JOB_PENDING(job_ptr)) || (detail_ptr == NULL)) { - info("update_job: attempt to modify account for " - "non-pending job_id %u", job_specs->job_id); - error_code = ESLURM_DISABLED; - } else { - acct_association_rec_t assoc_rec, *assoc_ptr; - bzero(&assoc_rec, sizeof(acct_association_rec_t)); - - assoc_rec.uid = job_ptr->user_id; - assoc_rec.partition = job_ptr->partition; - assoc_rec.acct = job_specs->account; - if (assoc_mgr_fill_in_assoc(acct_db_conn, &assoc_rec, - accounting_enforce, - &assoc_ptr)) { - info("job_update: invalid account %s for " - "job_id %u", - job_specs->account, job_ptr->job_id); - error_code = ESLURM_INVALID_ACCOUNT; - } else { - xfree(job_ptr->account); - if (assoc_rec.acct[0] != '\0') { - job_ptr->account = - xstrdup(assoc_rec.acct); - info("update_job: setting account to " - "%s for job_id %u", - assoc_rec.acct, job_ptr->job_id); - } else { - info("update_job: cleared account for " - "job_id %u", - job_specs->job_id); - } - job_ptr->assoc_id = assoc_rec.id; - job_ptr->assoc_ptr = (void *) assoc_ptr; - } - } + int rc = update_job_account("update_job", job_ptr, + job_specs->account); + if (rc != SLURM_SUCCESS) + error_code = rc; } if (job_specs->ntasks_per_node != (uint16_t) NO_VAL) { if ((!IS_JOB_PENDING(job_ptr)) || (detail_ptr == NULL)) error_code = ESLURM_DISABLED; else if (super_user) { - detail_ptr->ntasks_per_node = - job_specs->ntasks_per_node; + detail_ptr->ntasks_per_node = job_specs-> + ntasks_per_node; info("update_job: setting ntasks_per_node to %u for " "job_id %u", job_specs->ntasks_per_node, job_specs->job_id); } else { - error("Not super user: setting ntasks_oper_node to job %u", - job_specs->job_id); + error("Not super user: setting ntasks_oper_node to " + "job %u", job_specs->job_id); error_code = ESLURM_ACCESS_DENIED; } } @@ -5077,6 +5047,16 @@ extern void job_completion_logger(struct job_record *job_ptr) } g_slurm_jobcomp_write(job_ptr); + + /* + * This means the job wasn't ever eligible, but we want to + * keep 
track of all jobs, so we will set the db_inx to + * INFINITE and the database will understand what happened. + */ + if(!job_ptr->nodes && !job_ptr->db_index) { + jobacct_storage_g_job_start(acct_db_conn, job_ptr); + } + jobacct_storage_g_job_complete(acct_db_conn, job_ptr); } @@ -5686,7 +5666,8 @@ extern int job_cancel_by_assoc_id(uint32_t assoc_id) if ((job_ptr->assoc_id != assoc_id) || IS_JOB_FINISHED(job_ptr)) continue; - info("Association deleted, cancelling job %u", job_ptr->job_id); + info("Association deleted, cancelling job %u", + job_ptr->job_id); job_signal(job_ptr->job_id, SIGKILL, 0, 0); job_ptr->state_reason = FAIL_BANK_ACCOUNT; xfree(job_ptr->state_desc); @@ -5695,3 +5676,55 @@ extern int job_cancel_by_assoc_id(uint32_t assoc_id) list_iterator_destroy(job_iterator); return cnt; } + +/* + * Modify the account associated with a pending job + * IN module - where this is called from + * IN job_ptr - pointer to job which should be modified + * IN new_account - desired account name + * RET SLURM_SUCCESS or error code + */ +extern int update_job_account(char *module, struct job_record *job_ptr, + char *new_account) +{ + acct_association_rec_t assoc_rec, *assoc_ptr; + + if ((!IS_JOB_PENDING(job_ptr)) || (job_ptr->details == NULL)) { + info("%s: attempt to modify account for non-pending " + "job_id %u", module, job_ptr->job_id); + return ESLURM_DISABLED; + } + + + bzero(&assoc_rec, sizeof(acct_association_rec_t)); + assoc_rec.uid = job_ptr->user_id; + assoc_rec.partition = job_ptr->partition; + assoc_rec.acct = new_account; + if (assoc_mgr_fill_in_assoc(acct_db_conn, &assoc_rec, + accounting_enforce, &assoc_ptr)) { + info("%s: invalid account %s for job_id %u", + module, new_account, job_ptr->job_id); + return ESLURM_INVALID_ACCOUNT; + } + + + xfree(job_ptr->account); + if (assoc_rec.acct[0] != '\0') { + job_ptr->account = xstrdup(assoc_rec.acct); + info("%s: setting account to %s for job_id %u", + module, assoc_rec.acct, job_ptr->job_id); + } else { + 
info("%s: cleared account for job_id %u", + module, job_ptr->job_id); + } + job_ptr->assoc_id = assoc_rec.id; + job_ptr->assoc_ptr = (void *) assoc_ptr; + + if (job_ptr->details && job_ptr->details->begin_time) { + /* Update account associated with the eligible time */ + jobacct_storage_g_job_start(acct_db_conn, job_ptr); + } + last_job_update = time(NULL); + + return SLURM_SUCCESS; +} diff --git a/src/slurmctld/job_scheduler.c b/src/slurmctld/job_scheduler.c index 713f98a0b34..7d36219eba6 100644 --- a/src/slurmctld/job_scheduler.c +++ b/src/slurmctld/job_scheduler.c @@ -548,7 +548,11 @@ extern int make_batch_job_cred(batch_job_launch_msg_t *launch_msg_ptr, cred_arg.jobid = launch_msg_ptr->job_id; cred_arg.stepid = launch_msg_ptr->step_id; cred_arg.uid = launch_msg_ptr->uid; +#ifdef HAVE_FRONT_END + cred_arg.hostlist = node_record_table_ptr[0].name; +#else cred_arg.hostlist = launch_msg_ptr->nodes; +#endif if (job_ptr->details == NULL) cred_arg.job_mem = 0; else if (job_ptr->details->job_min_memory & MEM_PER_CPU) { diff --git a/src/slurmctld/proc_req.c b/src/slurmctld/proc_req.c index 6a977229a4d..6f4be965c34 100644 --- a/src/slurmctld/proc_req.c +++ b/src/slurmctld/proc_req.c @@ -519,8 +519,11 @@ static int _make_step_cred(struct step_record *step_rec, cred_arg.stepid = step_rec->step_id; cred_arg.uid = job_ptr->user_id; cred_arg.job_mem = job_ptr->details->job_min_memory; +#ifdef HAVE_FRONT_END + cred_arg.hostlist = node_record_table_ptr[0].name; +#else cred_arg.hostlist = step_rec->step_layout->node_list; - +#endif cred_arg.alloc_lps_cnt = job_ptr->alloc_lps_cnt; if ((cred_arg.alloc_lps_cnt > 0) && bit_equal(job_ptr->node_bitmap, step_rec->step_node_bitmap)) { @@ -554,8 +557,7 @@ static int _make_step_cred(struct step_record *step_rec, cred_arg.alloc_lps = NULL; } - *slurm_cred = slurm_cred_create(slurmctld_config.cred_ctx, - &cred_arg); + *slurm_cred = slurm_cred_create(slurmctld_config.cred_ctx, &cred_arg); xfree(cred_arg.alloc_lps); if (*slurm_cred == NULL) { 
error("slurm_cred_create error"); @@ -1742,7 +1744,8 @@ static void _slurm_rpc_shutdown_controller_immediate(slurm_msg_t * msg) * represent the termination of an entire job */ static void _slurm_rpc_step_complete(slurm_msg_t *msg) { - int error_code = SLURM_SUCCESS, rc, rem, step_rc; + int error_code = SLURM_SUCCESS, rc, rem; + uint32_t step_rc; DEF_TIMERS; step_complete_msg_t *req = (step_complete_msg_t *)msg->data; /* Locks: Write job, write node */ @@ -1801,7 +1804,7 @@ static void _slurm_rpc_step_complete(slurm_msg_t *msg) } } else { error_code = job_step_complete(req->job_id, req->job_step_id, - uid, job_requeue, step_rc); + uid, job_requeue, step_rc); unlock_slurmctld(job_write_lock); END_TIMER2("_slurm_rpc_step_complete"); diff --git a/src/slurmctld/slurmctld.h b/src/slurmctld/slurmctld.h index dedbe812713..3895e6fc064 100644 --- a/src/slurmctld/slurmctld.h +++ b/src/slurmctld/slurmctld.h @@ -1416,7 +1416,7 @@ extern bool step_on_node(struct job_record *job_ptr, * RET 0 on success, otherwise ESLURM error code */ extern int step_partial_comp(step_complete_msg_t *req, int *rem, - int *max_rc); + uint32_t *max_rc); /* Update time stamps for job step suspend */ extern void suspend_job_step(struct job_record *job_ptr); @@ -1438,6 +1438,16 @@ extern int sync_job_files(void); */ extern int update_job (job_desc_msg_t * job_specs, uid_t uid); +/* + * Modify the account associated with a pending job + * IN module - where this is called from + * IN job_ptr - pointer to job which should be modified + * IN new_account - desired account name + * RET SLURM_SUCCESS or error code + */ +extern int update_job_account(char *module, struct job_record *job_ptr, + char *new_account); + /* Reset nodes_completing field for all jobs */ extern void update_job_nodes_completing(void); diff --git a/src/slurmctld/step_mgr.c b/src/slurmctld/step_mgr.c index 5c2c459dbfb..952306bb0ac 100644 --- a/src/slurmctld/step_mgr.c +++ b/src/slurmctld/step_mgr.c @@ -1468,7 +1468,7 @@ extern int 
job_step_checkpoint_task_comp(checkpoint_task_comp_msg_t *ckpt_ptr, * RET 0 on success, otherwise ESLURM error code */ extern int step_partial_comp(step_complete_msg_t *req, int *rem, - int *max_rc) + uint32_t *max_rc) { struct job_record *job_ptr; struct step_record *step_ptr; diff --git a/src/slurmd/slurmd/slurmd.c b/src/slurmd/slurmd/slurmd.c index e02697f002d..2863a5eeebf 100644 --- a/src/slurmd/slurmd/slurmd.c +++ b/src/slurmd/slurmd/slurmd.c @@ -589,9 +589,11 @@ _read_config() * valid aliases */ if (conf->node_name == NULL) conf->node_name = slurm_conf_get_aliased_nodename(); - if (conf->node_name == NULL) + + if (conf->node_name == NULL) conf->node_name = slurm_conf_get_nodename("localhost"); - if (conf->node_name == NULL) + + if (conf->node_name == NULL) fatal("Unable to determine this slurmd's NodeName"); _massage_pathname(&conf->logfile); diff --git a/src/slurmd/slurmstepd/mgr.c b/src/slurmd/slurmstepd/mgr.c index b3a9378be46..4198bdad34a 100644 --- a/src/slurmd/slurmstepd/mgr.c +++ b/src/slurmd/slurmstepd/mgr.c @@ -233,9 +233,25 @@ static void _batch_finish(slurmd_job_t *job, int rc) { int i; - for (i = 0; i < job->ntasks; i++) + for (i = 0; i < job->ntasks; i++) { + /* If signalled we only need to check one and then + break out of the loop */ + if(WIFSIGNALED(job->task[i]->estatus)) { + switch(WTERMSIG(job->task[i]->estatus)) { + case SIGTERM: + case SIGKILL: + case SIGINT: + step_complete.step_rc = NO_VAL; + break; + default: + step_complete.step_rc = job->task[i]->estatus; + break; + } + break; + } step_complete.step_rc = MAX(step_complete.step_rc, WEXITSTATUS(job->task[i]->estatus)); + } if (job->argv[0] && (unlink(job->argv[0]) < 0)) error("unlink(%s): %m", job->argv[0]); @@ -494,10 +510,25 @@ _wait_for_children_slurmstepd(slurmd_job_t *job) } /* Find the maximum task return code */ - for (i = 0; i < job->ntasks; i++) + for (i = 0; i < job->ntasks; i++) { + /* If signalled we only need to check one and then + break out of the loop */ + 
if(WIFSIGNALED(job->task[i]->estatus)) { + switch(WTERMSIG(job->task[i]->estatus)) { + case SIGTERM: + case SIGKILL: + case SIGINT: + step_complete.step_rc = NO_VAL; + break; + default: + step_complete.step_rc = job->task[i]->estatus; + break; + } + break; + } step_complete.step_rc = MAX(step_complete.step_rc, - WEXITSTATUS(job->task[i]->estatus)); - + WEXITSTATUS(job->task[i]->estatus)); + } step_complete.wait_children = false; pthread_mutex_unlock(&step_complete.lock); diff --git a/src/slurmd/slurmstepd/req.c b/src/slurmd/slurmstepd/req.c index ebfbd430ac8..65f324d457c 100644 --- a/src/slurmd/slurmstepd/req.c +++ b/src/slurmd/slurmstepd/req.c @@ -174,10 +174,16 @@ _domain_socket_create(const char *dir, const char *nodename, * First check to see if the named socket already exists. */ if (stat(name, &stat_buf) == 0) { - error("Socket %s already exists", name); - xfree(name); - errno = ESLURMD_STEP_EXISTS; - return -1; + /* Vestigial from a slurmd crash or job requeue that did not + * happen properly (very rare conditions). 
Try another name */ + xstrcat(name, ".ALT"); + if (stat(name, &stat_buf) == 0) { + error("Socket %s already exists", name); + xfree(name); + errno = ESLURMD_STEP_EXISTS; + return -1; + } + error("Using alternate socket name %s", name); } fd = _create_socket(name); diff --git a/src/slurmdbd/slurmdbd.c b/src/slurmdbd/slurmdbd.c index ae2a361ca39..eb2580e48f7 100644 --- a/src/slurmdbd/slurmdbd.c +++ b/src/slurmdbd/slurmdbd.c @@ -97,6 +97,7 @@ int main(int argc, char *argv[]) pthread_attr_t thread_attr; char node_name[128]; void *db_conn = NULL; + assoc_init_args_t assoc_init_arg; _init_config(); log_init(argv[0], log_opts, LOG_DAEMON, NULL); @@ -139,7 +140,10 @@ int main(int argc, char *argv[]) db_conn = acct_storage_g_get_connection(false, false); - if(assoc_mgr_init(db_conn, NULL) == SLURM_ERROR) { + memset(&assoc_init_arg, 0, sizeof(assoc_init_args_t)); + assoc_init_arg.cache_level = ASSOC_MGR_CACHE_USER; + + if(assoc_mgr_init(db_conn, &assoc_init_arg) == SLURM_ERROR) { error("Problem getting cache of data"); acct_storage_g_close_connection(&db_conn); goto end_it; diff --git a/src/smap/configure_functions.c b/src/smap/configure_functions.c index f899b0c3d71..80735294979 100644 --- a/src/smap/configure_functions.c +++ b/src/smap/configure_functions.c @@ -1276,7 +1276,7 @@ static void _print_text_command(allocated_block_t *allocated_block) main_xcord += 7; mvwprintw(text_win, main_ycord, - main_xcord, "%d",allocated_block->request->size); + main_xcord, "%d", allocated_block->request->size); main_xcord += 10; if(allocated_block->request->conn_type == SELECT_SMALL) { diff --git a/src/srun/allocate.c b/src/srun/allocate.c index 5b5c53bd55b..5faf12f08b3 100644 --- a/src/srun/allocate.c +++ b/src/srun/allocate.c @@ -96,7 +96,7 @@ static void _signal_while_allocating(int signo) { destroy_job = 1; if (pending_job_id != 0) { - slurm_complete_job(pending_job_id, 0); + slurm_complete_job(pending_job_id, NO_VAL); } } diff --git a/src/srun/srun.c b/src/srun/srun.c index 
1f9247d8a9c..d82a554d358 100644 --- a/src/srun/srun.c +++ b/src/srun/srun.c @@ -110,7 +110,7 @@ mpi_plugin_client_info_t mpi_job_info[1]; static struct termios termdefaults; -int global_rc; +uint32_t global_rc = 0; srun_job_t *job = NULL; struct { @@ -250,7 +250,6 @@ int srun(int ac, char **av) if (!job || create_job_step(job) < 0) exit(1); } else { - got_alloc = 1; /* Combined job allocation and job step launch */ #ifdef HAVE_FRONT_END uid_t my_uid = getuid(); @@ -263,9 +262,11 @@ int srun(int ac, char **av) if ( !(resp = allocate_nodes()) ) exit(1); + got_alloc = 1; _print_job_information(resp); _set_cpu_env_var(resp); job = job_create_allocation(resp); + opt.exclusive = false; /* not applicable for this step */ if (!job || create_job_step(job) < 0) { slurm_complete_job(job->jobid, 1); @@ -433,7 +434,7 @@ cleanup: _task_state_struct_free(); log_fini(); - return global_rc; + return (int)global_rc; } static int _call_spank_local_user (srun_job_t *job) @@ -953,7 +954,7 @@ _task_finish(task_exit_msg_t *msg) char buf[2048], *core_str = "", *msg_str, *node_list = NULL; static bool first_done = true; static bool first_error = true; - int rc = 0; + uint32_t rc = 0; int i; verbose("%u tasks finished (rc=%u)", @@ -975,7 +976,6 @@ _task_finish(task_exit_msg_t *msg) } } else if (WIFSIGNALED(msg->return_code)) { bit_or(task_state.finish_abnormal, tasks_exited); - rc = 1; msg_str = strsignal(WTERMSIG(msg->return_code)); #ifdef WCOREDUMP if (WCOREDUMP(msg->return_code)) @@ -983,9 +983,11 @@ _task_finish(task_exit_msg_t *msg) #endif node_list = _taskids_to_nodelist(tasks_exited); if (job->state >= SRUN_JOB_CANCELLED) { + rc = NO_VAL; verbose("%s: task %s: %s%s", node_list, buf, msg_str, core_str); } else { + rc = msg->return_code; error("%s: task %s: %s%s", node_list, buf, msg_str, core_str); } diff --git a/testsuite/expect/globals b/testsuite/expect/globals index 0126835ee00..a9003d3dfbc 100755 --- a/testsuite/expect/globals +++ b/testsuite/expect/globals @@ -1134,3 +1134,54 @@ 
proc check_acct_associations { } { log_user 1 return $rc } + +################################################################ +# +# +# +################################################################ +proc check_accounting_admin_level { } { + global sacctmgr alpha alpha_numeric_under bin_id + + set admin_level "" + + log_user 0 + + spawn $bin_id -u -n + expect { + -re "($alpha_numeric_under)" { + set user_name $expect_out(1,string) + exp_continue + } + eof { + wait + } + } + + if { ![string length $user_name] } { + send_user "FAILURE: No name returned from id\n" + return "" + } + + # + # Use sacctmgr to check admin_level + # + set s_pid [spawn $sacctmgr -n -P list user $user_name format=admin] + expect { + -re "($alpha)" { + set admin_level $expect_out(1,string) + exp_continue + } + timeout { + send_user "FAILURE: sacctmgr add not responding\n" + slow_kill $s_pid + set exit_code 1 + } + eof { + wait + } + } + + log_user 1 + return $admin_level +} diff --git a/testsuite/expect/test21.10 b/testsuite/expect/test21.10 index e40d6a52d9b..f74f24bb17b 100755 --- a/testsuite/expect/test21.10 +++ b/testsuite/expect/test21.10 @@ -70,11 +70,138 @@ set access_err 0 print_header $test_id +# +# Check accounting config and bail if not found. 
+# if { [test_account_storage] == 0 } { send_user "\nWARNING: This test can't be run without a usable AccountStorageType\n" exit 0 } - + +if { [string compare [check_accounting_admin_level] "Administrator"] } { + send_user "\nWARNING: This test can't be run without being an Accounting administrator.\nUse sacctmgr mod user \$USER_NAME admin=admin.\n" + exit 0 +} + +# +# Use sacctmgr to delete the test cluster +# + set nothing 0 + set matches 0 + +set sadel_pid [spawn $sacctmgr -i $del $clu $tc1] + + expect { + -re "privilege to perform this action" { + set access_err 1 + exp_continue + } + -re "(There was a problem|Unknown condition|Bad format on|Bad MaxWall|Unknown option)" { + send_user "FAILURE: there was a problem with the sacctmgr command\n" + incr exit_code 1 + } + -re "Problem getting" { + send_user "FAILURE: there was a problem getting information from the database\n" + incr exit_code 1 + } + -re "Problem adding" { + send_user "FAILURE: there was an unknwon problem\n" + incr exit_code 1 + } + -re "No associations" { + send_user "FAILURE: your command didn't return anything\n" + incr exit_code 1 + } + -re "Deleting clusters" { + incr matches + exp_continue + } + -re " Nothing deleted" { + incr matches + set nothing 1 + exp_continue + } + timeout { + send_user "\nFAILURE: sacctmgr delete not responding\n" + slow_kill $my_pid + incr exit_code 1 + } + eof { + wait + } + } + if {$access_err != 0} { + return 1 + } + if {$matches != 1} { + send_user "\nFAILURE: sacctmgr had a problem deleting cluster got $matches\n" + incr exit_code 1 + } + if { !$nothing } { + if { ![check_acct_associations] } { + send_user "\nFAILURE: Our associations don't line up\n" + incr exit_code 1 + } + } + +# +# Use sacctmgr to remove an account +# + + set matches 0 + set nothing 1 + set check "Deleting account" + + set my_pid [eval spawn $sacctmgr -i delete account $nm1] + expect { + -re "(There was a problem|Unknown condition|Bad format on|Bad MaxWall|Unknown option)" { + send_user 
"FAILURE: there was a problem with the sacctmgr command\n" + incr exit_code 1 + } + -re "Problem getting" { + send_user "FAILURE: there was a problem getting information from the database\n" + incr exit_code 1 + } + -re "Problem adding" { + send_user "FAILURE: there was an unknwon problem\n" + incr exit_code 1 + } + -re "No associations" { + send_user "FAILURE: your command didn't return anything\n" + incr exit_code 1 + } + -re "$check" { + incr matches + exp_continue + } + -re " Nothing deleted" { + incr matches + set nothing 1 + exp_continue + } + timeout { + send_user "\nFAILURE: sacctmgr add not responding\n" + slow_kill $my_pid + incr exit_code 1 + } + eof { + wait + } + } + + if {$matches != 1} { + send_user "\nFAILURE: sacctmgr had a problem deleting account. + got $matches\n" + incr exit_code 1 + } + + if { !$nothing } { + if { ![check_acct_associations] } { + send_user "\nFAILURE: Our associations don't line up\n" + incr exit_code 1 + } + } + # # Use sacctmgr to create a cluster # diff --git a/testsuite/expect/test21.11 b/testsuite/expect/test21.11 index 3411f0b80fa..99571a5925d 100755 --- a/testsuite/expect/test21.11 +++ b/testsuite/expect/test21.11 @@ -78,6 +78,14 @@ if { [test_account_storage] == 0 } { exit 0 } +# +# Verify if Administrator privileges +# + if { [string compare [check_accounting_admin_level] "Administrator"] } { + send_user "\nWARNING: This test can't be run without being an Accounting administrator.\nUse sacctmgr mod user \$USER_NAME admin=admin.\n" + exit 0 +} + # # Use sacctmgr to create a cluster # diff --git a/testsuite/expect/test21.12 b/testsuite/expect/test21.12 index a7a363bbe37..ed897c9542b 100755 --- a/testsuite/expect/test21.12 +++ b/testsuite/expect/test21.12 @@ -79,6 +79,14 @@ if { [test_account_storage] == 0 } { exit 0 } +# +# Verify if Administrator privileges +# + if { [string compare [check_accounting_admin_level] "Administrator"] } { + send_user "\nWARNING: This test can't be run without being an Accounting 
administrator.\nUse sacctmgr mod user \$USER_NAME admin=admin.\n" + exit 0 +} + # # Use sacctmgr to create a cluster # diff --git a/testsuite/expect/test21.13 b/testsuite/expect/test21.13 index c6b9074282d..4da00c44013 100755 --- a/testsuite/expect/test21.13 +++ b/testsuite/expect/test21.13 @@ -74,6 +74,13 @@ if { [test_account_storage] == 0 } { exit 0 } +# +# Verify if Administrator privileges +# + if { [string compare [check_accounting_admin_level] "Administrator"] } { + send_user "\nWARNING: This test can't be run without being an Accounting administrator.\nUse sacctmgr mod user \$USER_NAME admin=admin.\n" + exit 0 +} # # Use sacctmgr to create a cluster diff --git a/testsuite/expect/test21.14 b/testsuite/expect/test21.14 index b73c438cc4b..c6b138ebeb0 100755 --- a/testsuite/expect/test21.14 +++ b/testsuite/expect/test21.14 @@ -75,6 +75,11 @@ if { [test_account_storage] == 0 } { exit 0 } +if { [string compare [check_accounting_admin_level] "Administrator"] } { + send_user "\nWARNING: This test can't be run without being an Accounting administrator.\nUse sacctmgr mod user \$USER_NAME admin=admin.\n" + exit 0 +} + # # Use sacctmgr to create a cluster diff --git a/testsuite/expect/test21.15 b/testsuite/expect/test21.15 index 529e24a4cda..c6a7f0eeb68 100755 --- a/testsuite/expect/test21.15 +++ b/testsuite/expect/test21.15 @@ -102,6 +102,10 @@ if { [test_account_storage] == 0 } { exit 0 } +if { [string compare [check_accounting_admin_level] "Administrator"] } { + send_user "\nWARNING: This test can't be run without being an Accounting administrator.\nUse sacctmgr mod user \$USER_NAME admin=admin.\n" + exit 0 +} # # Use sacctmgr to create a cluster diff --git a/testsuite/expect/test21.16 b/testsuite/expect/test21.16 index 5d318f87e60..4c8dfc49d6d 100755 --- a/testsuite/expect/test21.16 +++ b/testsuite/expect/test21.16 @@ -88,7 +88,7 @@ set mn maxnode set mw maxwall set dbu debug set access_err 0 - +#set user_name "id -u -n" print_header $test_id @@ -102,6 +102,10 @@ 
if { [test_account_storage] == 0 } { exit 0 } +if { [string compare [check_accounting_admin_level] "Administrator"] } { + send_user "\nWARNING: This test can't be run without being an Accounting administrator.\nUse sacctmgr mod user \$USER_NAME admin=admin.\n" + exit 0 +} # # Use sacctmgr to create a cluster diff --git a/testsuite/expect/test21.17 b/testsuite/expect/test21.17 index e7376f3debc..21d001a5d1e 100755 --- a/testsuite/expect/test21.17 +++ b/testsuite/expect/test21.17 @@ -102,6 +102,11 @@ if { [test_account_storage] == 0 } { exit 0 } +if { [string compare [check_accounting_admin_level] "Administrator"] } { + send_user "\nWARNING: This test can't be run without being an Accounting administrator.\nUse sacctmgr mod user \$USER_NAME admin=admin.\n" + exit 0 +} + # # Use sacctmgr to create a cluster diff --git a/testsuite/expect/test21.18 b/testsuite/expect/test21.18 index 84662a10b0d..20c848f138f 100755 --- a/testsuite/expect/test21.18 +++ b/testsuite/expect/test21.18 @@ -102,6 +102,11 @@ if { [test_account_storage] == 0 } { exit 0 } +if { [string compare [check_accounting_admin_level] "Administrator"] } { + send_user "\nWARNING: This test can't be run without being an Accounting administrator.\nUse sacctmgr mod user \$USER_NAME admin=admin.\n" + exit 0 +} + # # Use sacctmgr to create a cluster diff --git a/testsuite/expect/test21.19 b/testsuite/expect/test21.19 index 4ef4e677330..2d62d68384b 100755 --- a/testsuite/expect/test21.19 +++ b/testsuite/expect/test21.19 @@ -102,6 +102,11 @@ if { [test_account_storage] == 0 } { exit 0 } +if { [string compare [check_accounting_admin_level] "Administrator"] } { + send_user "\nWARNING: This test can't be run without being an Accounting administrator.\nUse sacctmgr mod user \$USER_NAME admin=admin.\n" + exit 0 +} + # # Use sacctmgr to create a cluster diff --git a/testsuite/expect/test21.5 b/testsuite/expect/test21.5 index 4110ad1184a..5daa909f2de 100755 --- a/testsuite/expect/test21.5 +++ b/testsuite/expect/test21.5 @@ 
-67,6 +67,11 @@ if { [test_account_storage] == 0 } { exit 0 } +if { [string compare [check_accounting_admin_level] "Administrator"] } { + send_user "\nWARNING: This test can't be run without being an Accounting administrator.\nUse sacctmgr mod user \$USER_NAME admin=admin.\n" + exit 0 +} + # # Use sacctmgr to delete the test cluster # diff --git a/testsuite/expect/test21.6 b/testsuite/expect/test21.6 index 3bfb62ac521..ac83876bc07 100755 --- a/testsuite/expect/test21.6 +++ b/testsuite/expect/test21.6 @@ -61,11 +61,80 @@ set access_err 0 print_header $test_id +# +# Check accounting config and bail if not found. +# if { [test_account_storage] == 0 } { send_user "\nWARNING: This test can't be run without a usable AccountStorageType\n" exit 0 } - + +if { [string compare [check_accounting_admin_level] "Administrator"] } { + send_user "\nWARNING: This test can't be run without being an Accounting administrator.\nUse sacctmgr mod user \$USER_NAME admin=admin.\n" + exit 0 +} + +# +# Use sacctmgr to delete the test cluster +# + set nothing 0 + set matches 0 + +set sadel_pid [spawn $sacctmgr -i $del $clu $tc1,$tc2,$tc3] + + expect { + -re "privilege to perform this action" { + set access_err 1 + exp_continue + } + -re "(There was a problem|Unknown condition|Bad format on|Bad MaxWall|Unknown option)" { + send_user "FAILURE: there was a problem with the sacctmgr command\n" + incr exit_code 1 + } + -re "Problem getting" { + send_user "FAILURE: there was a problem getting information from the database\n" + incr exit_code 1 + } + -re "Problem adding" { + send_user "FAILURE: there was an unknwon problem\n" + incr exit_code 1 + } + -re "No associations" { + send_user "FAILURE: your command didn't return anything\n" + incr exit_code 1 + } + -re "Deleting clusters" { + incr matches + exp_continue + } + -re " Nothing deleted" { + incr matches + set nothing 1 + exp_continue + } + timeout { + send_user "\nFAILURE: sacctmgr delete not responding\n" + slow_kill $my_pid + incr exit_code 1 
+ } + eof { + wait + } + } + if {$access_err != 0} { + return 1 + } + if {$matches != 1} { + send_user "\nFAILURE: sacctmgr had a problem deleting cluster got $matches\n" + incr exit_code 1 + } + if { !$nothing } { + if { ![check_acct_associations] } { + send_user "\nFAILURE: Our associations don't line up\n" + incr exit_code 1 + } + } + # # Use sacctmgr to create a cluster # diff --git a/testsuite/expect/test21.7 b/testsuite/expect/test21.7 index 0f464fdd195..ee0067db4de 100755 --- a/testsuite/expect/test21.7 +++ b/testsuite/expect/test21.7 @@ -65,7 +65,15 @@ if { [test_account_storage] == 0 } { send_user "\nWARNING: This test can't be run without a usable AccountStorageType\n" exit 0 } - + +# +# Verify if Administrator privileges +# + if { [string compare [check_accounting_admin_level] "Administrator"] } { + send_user "\nWARNING: This test can't be run without being an Accounting administrator.\nUse sacctmgr mod user \$USER_NAME admin=admin.\n" + exit 0 +} + # # Use sacctmgr to create a cluster # diff --git a/testsuite/expect/test21.8 b/testsuite/expect/test21.8 index 6d414415c6a..e90f04bace4 100755 --- a/testsuite/expect/test21.8 +++ b/testsuite/expect/test21.8 @@ -72,6 +72,14 @@ if { [test_account_storage] == 0 } { exit 0 } +# +# Verify if Administrator privileges +# + if { [string compare [check_accounting_admin_level] "Administrator"] } { + send_user "\nWARNING: This test can't be run without being an Accounting administrator.\nUse sacctmgr mod user \$USER_NAME admin=admin.\n" + exit 0 +} + # # Use sacctmgr to create a cluster # diff --git a/testsuite/expect/test21.9 b/testsuite/expect/test21.9 index 4ec5194503a..bb5aad587da 100755 --- a/testsuite/expect/test21.9 +++ b/testsuite/expect/test21.9 @@ -72,6 +72,14 @@ if { [test_account_storage] == 0 } { exit 0 } +# +# Verify if Administrator privileges +# + if { [string compare [check_accounting_admin_level] "Administrator"] } { + send_user "\nWARNING: This test can't be run without being an Accounting 
administrator.\nUse sacctmgr mod user \$USER_NAME admin=admin.\n" + exit 0 +} + # # Use sacctmgr to create a cluster # diff --git a/testsuite/expect/test7.3 b/testsuite/expect/test7.3 index 86fe053a947..07b72d80a21 100755 --- a/testsuite/expect/test7.3 +++ b/testsuite/expect/test7.3 @@ -57,7 +57,7 @@ send_user "slurm_dir is $slurm_dir\n" if {[test_aix]} { send_user "$bin_cc ${test_prog}.c -Wl,-brtl -g -pthread -o ${test_prog} -I${slurm_dir}/include -L${slurm_dir}/lib -lslurm -lntbl\n" exec $bin_cc ${test_prog}.c -Wl,-brtl -g -pthread -o ${test_prog} -I${slurm_dir}/include -L${slurm_dir}/lib -lslurm -lntbl -} elseif [file exists ${slurm_dir}/lib64] { +} elseif [file exists ${slurm_dir}/lib64/libslurm.so] { send_user "$bin_cc ${test_prog}.c -g -pthread -o ${test_prog} -I${slurm_dir}/include -Wl,--rpath=${slurm_dir}/lib64 -L${slurm_dir}/lib64 -lslurm\n" exec $bin_cc ${test_prog}.c -g -pthread -o ${test_prog} -I${slurm_dir}/include -Wl,--rpath=${slurm_dir}/lib64 -L${slurm_dir}/lib64 -lslurm } else { diff --git a/testsuite/expect/test7.7 b/testsuite/expect/test7.7 index c55a44bc164..7a90ba1cfe5 100755 --- a/testsuite/expect/test7.7 +++ b/testsuite/expect/test7.7 @@ -150,7 +150,7 @@ make_bash_script $file_in " echo BEGIN $bin_sleep 20 echo FINI - exit 123" + exit 0" set job_id1 0 set job_id2 0 set sbatch_pid [spawn $sbatch -N1-1024 --output=$file_out --comment=test -t1 $file_in] diff --git a/testsuite/expect/test7.7.prog.c b/testsuite/expect/test7.7.prog.c index cc8a536858b..81ba2a5e36d 100644 --- a/testsuite/expect/test7.7.prog.c +++ b/testsuite/expect/test7.7.prog.c @@ -25,6 +25,7 @@ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 
\*****************************************************************************/ +#include <errno.h> #include <netdb.h> #include <stdio.h> #include <stdlib.h> @@ -71,7 +72,7 @@ static int _conn_wiki_port(char *host, int port) static int _conn_event_port(char *host, int port) { - int sock_fd; + int i, rc, sock_fd; struct sockaddr_in wiki_addr; struct hostent *hptr; @@ -88,11 +89,19 @@ static int _conn_event_port(char *host, int port) wiki_addr.sin_family = AF_INET; wiki_addr.sin_port = htons(port); memcpy(&wiki_addr.sin_addr.s_addr, hptr->h_addr, hptr->h_length); - if (bind(sock_fd, (struct sockaddr *) &wiki_addr, - sizeof(wiki_addr))) { - printf("WARNING: bind to port %i failed, may not be real error\n", - port); - return -1; + for (i=0; ; i++) { + if (i) + sleep(5); + rc = bind(sock_fd, (struct sockaddr *) &wiki_addr, + sizeof(wiki_addr)); + if (rc == 0) + break; + if ((errno != EINVAL) || (i > 5)) { + printf("WARNING: bind to port %i; %s\n", + port, strerror(errno)); + return -1; + } + printf("WARNING: port %i in use, retrying\n", port); } listen(sock_fd, 1); return sock_fd; @@ -179,7 +188,7 @@ static char *_recv_msg(int fd) return buf; } -static void _xmit(char *msg) +static int _xmit(char *msg) { int msg_len = strlen(msg); char *out_msg, *in_msg, sum[20], *sc_ptr; @@ -199,12 +208,11 @@ static void _xmit(char *msg) printf("recv:%s\n\n", in_msg); sc_ptr = strstr(in_msg, "SC="); sc = atoi(sc_ptr+3); - if (sc != 0) { + if (sc != 0) fprintf(stderr, "RPC failure\n"); - exit(1); - } free(in_msg); close(wiki_fd); + return sc; } static void _event_mgr(void) @@ -240,19 +248,22 @@ static void _get_jobs(void) snprintf(out_msg, sizeof(out_msg), "TS=%u AUTH=root DT=%s", (uint32_t) now, "CMD=GETJOBS ARG=0:ALL"); - _xmit(out_msg); + if (_xmit(out_msg)) + exit(1); /* Dump volitile data */ snprintf(out_msg, sizeof(out_msg), "TS=%u AUTH=root DT=CMD=GETJOBS ARG=%u:ALL", (uint32_t) now, (uint32_t) 1); - _xmit(out_msg); + if (_xmit(out_msg)) + exit(1); /* Dump state only */ 
snprintf(out_msg, sizeof(out_msg), "TS=%u AUTH=root DT=CMD=GETJOBS ARG=%u:ALL", (uint32_t) now, (uint32_t) (now+2)); - _xmit(out_msg); + if (_xmit(out_msg)) + exit(1); } static void _get_nodes(void) @@ -264,19 +275,22 @@ static void _get_nodes(void) snprintf(out_msg, sizeof(out_msg), "TS=%u AUTH=root DT=%s", (uint32_t) now, "CMD=GETNODES ARG=0:ALL"); - _xmit(out_msg); + if (_xmit(out_msg)) + exit(1); /* Dump volitile data */ snprintf(out_msg, sizeof(out_msg), "TS=%u AUTH=root DT=CMD=GETNODES ARG=%u:ALL", (uint32_t) now, (uint32_t) 1); - _xmit(out_msg); + if (_xmit(out_msg)) + exit(1); /* Dump state only */ snprintf(out_msg, sizeof(out_msg), "TS=%u AUTH=root DT=CMD=GETNODES ARG=%u:ALL", (uint32_t) now, (uint32_t) (now+2)); - _xmit(out_msg); + if (_xmit(out_msg)) + exit(1); } static void _cancel_job(long my_job_id) @@ -289,20 +303,32 @@ static void _cancel_job(long my_job_id) "TYPE=ADMIN " "COMMENT=\"cancel comment\" ", (uint32_t) now, my_job_id); - _xmit(out_msg); + if (_xmit(out_msg)) + exit(1); } static void _start_job(long my_job_id) { time_t now = time(NULL); char out_msg[128]; + int i, rc; snprintf(out_msg, sizeof(out_msg), "TS=%u AUTH=root DT=CMD=STARTJOB ARG=%ld " "COMMENT=\'start comment\' " "TASKLIST=", /* Empty TASKLIST means we don't care */ (uint32_t) now, my_job_id); - _xmit(out_msg); + + for (i=0; i<10; i++) { + if (i) + sleep(10); + rc = _xmit(out_msg); + if (rc == 0) + break; + /* Still completing after requeue */ + } + if (rc != 0) + exit(1); } static void _suspend_job(long my_job_id) @@ -313,7 +339,8 @@ static void _suspend_job(long my_job_id) snprintf(out_msg, sizeof(out_msg), "TS=%u AUTH=root DT=CMD=SUSPENDJOB ARG=%ld", (uint32_t) now, my_job_id); - _xmit(out_msg); + if (_xmit(out_msg)) + exit(1); } static void _signal_job(long my_job_id) @@ -324,7 +351,8 @@ static void _signal_job(long my_job_id) snprintf(out_msg, sizeof(out_msg), "TS=%u AUTH=root DT=CMD=SIGNALJOB ARG=%ld VALUE=URG", (uint32_t) now, my_job_id); - _xmit(out_msg); + if 
(_xmit(out_msg)) + exit(1); } static void _modify_job(long my_job_id) @@ -343,7 +371,8 @@ static void _modify_job(long my_job_id) /* "INVALID=123 " */ "TIMELIMIT=10 BANK=test_bank", (uint32_t) now, my_job_id); - _xmit(out_msg); + if (_xmit(out_msg)) + exit(1); } static void _notify_job(long my_job_id) @@ -355,7 +384,8 @@ static void _notify_job(long my_job_id) "TS=%u AUTH=root DT=CMD=NOTIFYJOB ARG=%ld " "MSG=this_is_a_test", (uint32_t) now, my_job_id); - _xmit(out_msg); + if (_xmit(out_msg)) + exit(1); } static void _resume_job(long my_job_id) @@ -366,7 +396,8 @@ static void _resume_job(long my_job_id) snprintf(out_msg, sizeof(out_msg), "TS=%u AUTH=root DT=CMD=RESUMEJOB ARG=%ld", (uint32_t) now, my_job_id); - _xmit(out_msg); + if (_xmit(out_msg)) + exit(1); } static void _job_requeue(long my_job_id) @@ -377,7 +408,8 @@ static void _job_requeue(long my_job_id) snprintf(out_msg, sizeof(out_msg), "TS=%u AUTH=root DT=CMD=REQUEUEJOB ARG=%ld", (uint32_t) now, my_job_id); - _xmit(out_msg); + if (_xmit(out_msg)) + exit(1); } static void _job_will_run(long my_job_id) @@ -389,7 +421,8 @@ static void _job_will_run(long my_job_id) "TS=%u AUTH=root DT=CMD=JOBWILLRUN ARG=JOBID=%ld,%s", (uint32_t) now, my_job_id, ""); /* put available node list here */ - _xmit(out_msg); + if (_xmit(out_msg)) + exit(1); } static void _initialize(void) @@ -400,7 +433,8 @@ static void _initialize(void) snprintf(out_msg, sizeof(out_msg), "TS=%u AUTH=root DT=CMD=INITIALIZE ARG=USEHOSTEXP=N EPORT=%u", (uint32_t) now, e_port); - _xmit(out_msg); + if (_xmit(out_msg)) + exit(1); } static void _single_msg(void) @@ -411,8 +445,10 @@ static void _single_msg(void) snprintf(out_msg, sizeof(out_msg), "TS=%u AUTH=root DT=CMD=%s", (uint32_t) now, - "JOBWILLRUN ARG=JOBID=65537,bgl[000x733] JOBID=65539,bgl[000x733] JOBID=65538,bgl[000x733]"); - _xmit(out_msg); + "JOBWILLRUN ARG=JOBID=65537,bgl[000x733] " + "JOBID=65539,bgl[000x733] JOBID=65538,bgl[000x733]"); + if (_xmit(out_msg)) + exit(1); } int main(int argc, 
char * argv[]) @@ -458,7 +494,7 @@ int main(int argc, char * argv[]) } _cancel_job(job_id+1); _job_requeue(job_id); /* Put job back into HELD state */ - sleep(15); + sleep(10); _start_job(job_id); _get_jobs(); #endif diff --git a/testsuite/expect/test7.8 b/testsuite/expect/test7.8 index 0713a576b7f..e170fa2f7ee 100755 --- a/testsuite/expect/test7.8 +++ b/testsuite/expect/test7.8 @@ -110,7 +110,7 @@ make_bash_script $file_in " echo BEGIN $bin_sleep 20 echo FINI - exit 123" + exit 0" set job_id1 0 set job_id2 0 -- GitLab