diff --git a/NEWS b/NEWS index 72a3cb815103103a409c58cbc9304549267e0cde..9be94d93f40f059a9b75fca24b6aa4a14254b6b0 100644 --- a/NEWS +++ b/NEWS @@ -13,6 +13,9 @@ documents those changes that are of interest to users and admins. -- Moved safe_read/write to slurm_protocol_defs.h removing multiple copies. -- Remove vestigial functions slurm_allocate_resources_and_run() and slurm_free_resource_allocation_and_run_response_msg(). + -- Added support for different executable files and arguments by task based + upon a configuration file. See srun's --multi-prog option (based upon + work by Hongjia Cao, National University of Defense Technology). -- moved the way forward logic waited for fanout logic mostly eliminating problems with scalability issues. diff --git a/doc/man/man1/srun.1 b/doc/man/man1/srun.1 index b783e8e5a24aa339d7e8de4412028f90ba885492..7a431785ba549941a64f2c58d13544e955b45785 100644 --- a/doc/man/man1/srun.1 +++ b/doc/man/man1/srun.1 @@ -89,7 +89,7 @@ Run a job with different programs and different arguments for each task. In this case, the executable program specified is actually a configuration file specifying the executable and arguments for each task. See \fBMULTIPLE PROGRAM CONFIGURATION\fR -below for details on the configuration file contents. +below for details on the configuration file contents. .TP \fB\-\-begin\fR=\fItime\fR @@ -1034,6 +1034,8 @@ Task rank One or more task ranks to use this configuration. Multiple values may be comma separated. Ranges may be indicated with two numbers separated with a '\-'. +To indicate all tasks, specify a rank of '*' (in which case you probably +should not be using this option). .TP Executable The name of the program to execute. 
diff --git a/src/srun/Makefile.am b/src/srun/Makefile.am index 2f76bd5da5b2315d34519c5989745160371536d3..e80f4ae8d147b5ce88df7bc7a33a17a2db77645d 100644 --- a/src/srun/Makefile.am +++ b/src/srun/Makefile.am @@ -29,6 +29,7 @@ srun_SOURCES = \ allocate.h \ core-format.c \ core-format.h \ + multi_prog.c multi_prog.h \ srun.wrapper.c convenience_libs = \ diff --git a/src/srun/msg.c b/src/srun/msg.c index fff81bf4443aa2d15db37ab7af7430ee7bc5ab3f..898345e8574c92d8b045b19560c397f7bee92ce8 100644 --- a/src/srun/msg.c +++ b/src/srun/msg.c @@ -65,6 +65,7 @@ #include "src/srun/sigstr.h" #include "src/srun/attach.h" #include "src/srun/allocate.h" +#include "src/srun/multi_prog.h" #include "src/common/xstring.h" @@ -193,6 +194,8 @@ static void _handle_update_mpir_proctable(int fd, srun_job_t *job) /* if all tasks are now accounted for, set the debug state and call the Breakpoint */ if (tasks_recorded == job->step_layout->num_tasks) { + if (opt.multi_prog) + set_multi_name(ntasks); MPIR_debug_state = MPIR_DEBUG_SPAWNED; MPIR_Breakpoint(); if (opt.debugger_test) diff --git a/src/srun/multi_prog.c b/src/srun/multi_prog.c new file mode 100644 index 0000000000000000000000000000000000000000..b8f4ac16375f6cdb34624f6492381aa67cd89fbd --- /dev/null +++ b/src/srun/multi_prog.c @@ -0,0 +1,199 @@ +/*****************************************************************************\ + * multi_prog.c - executing program according to task rank + * set MPIR_PROCDESC accordingly + * + * NOTE: The logic could be eliminated if slurmstepd kept track of the + * executable name for each task and returned that information in a new + * launch response message (with multiple executable names). + ***************************************************************************** + * Produced at National University of Defense Technology (China) + * Written by Hongjia Cao <hjcao@nudt.edu.cn> + * and + * Lawrence Livermore National Laboratory (cf, DISCLAIMER). + * Written by Morris Jette <jette1@llnl.gov>. 
+ * UCRL-CODE-217948. + * + * This file is part of SLURM, a resource management program. + * For details, see <http://www.llnl.gov/linux/slurm/>. + * + * SLURM is free software; you can redistribute it and/or modify it under + * the terms of the GNU General Public License as published by the Free + * Software Foundation; either version 2 of the License, or (at your option) + * any later version. + * + * SLURM is distributed in the hope that it will be useful, but WITHOUT ANY + * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more + * details. + * + * You should have received a copy of the GNU General Public License along + * with SLURM; if not, write to the Free Software Foundation, Inc., + * 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA. +\*****************************************************************************/ + +#include <stdio.h> +#include <ctype.h> +#include <string.h> +#include <stdlib.h> +#include <sys/stat.h> +#include <sys/types.h> +#include <unistd.h> +#include "src/common/log.h" +#include "src/common/xassert.h" +#include "src/common/xmalloc.h" +#include "src/common/xstring.h" +#include "src/srun/attach.h" + +/* Given a program name, translate it to a fully qualified pathname + * as needed based upon the PATH environment variable */ +static char * +_build_path(char* fname) +{ + int i; + char *path_env = NULL, *dir, *ptrptr; + static char file_name[256], file_path[256]; /* return values */ + struct stat buf; + + /* make copy of file name (end at white space) */ + snprintf(file_name, sizeof(file_name), "%s", fname); + for (i=0; i<sizeof(file_name); i++) { + if (file_name[i] == '\0') + break; + if (!isspace(file_name[i])) + continue; + file_name[i] = '\0'; + break; + } + + /* check if already absolute path */ + if (file_name[0] == '/') + return file_name; + + /* search for the file using PATH environment variable */ + dir = getenv("PATH"); + if 
(!dir) { + error("No PATH environment variable"); + return NULL; + } + path_env = xstrdup(dir); + dir = strtok_r(path_env, ":", &ptrptr); + while (dir) { + snprintf(file_path, sizeof(file_path), "%s/%s", dir, file_name); + if (stat(file_path, &buf) == 0) + break; + dir = strtok_r(NULL, ":", &ptrptr); + } + if (dir == NULL) { /* not found */ + error("Could not find executable %s", file_name); + snprintf(file_path, sizeof(file_path), "%s", file_name); + } + xfree(path_env); + return file_path; +} + +static void +_set_range(int low_num, int high_num, char *exec_name) +{ + int i; + + for (i=low_num; i<=high_num; i++) { + MPIR_PROCDESC *tv; + tv = &MPIR_proctable[i]; + if (tv->executable_name) { + error("duplicate configuration for task %d ignored", + i); + } else + tv->executable_name = xstrdup(exec_name); + } +} + +static void +_set_exec_names(char *ranks, char *exec_name, int ntasks) +{ + char *range, *p, *ptrptr, *exec_path, *upper; + int low_num, high_num; + + if (ranks[0] == '*' && ranks[1] == '\0') { + low_num = 0; + high_num = ntasks -1; + _set_range(low_num, high_num, exec_name); + return; + } + exec_path = _build_path(exec_name); + + for (range = strtok_r(ranks, ",", &ptrptr); range != NULL; + range = strtok_r(NULL, ",", &ptrptr)) { + p = range; /* classify the current token, not the first token of the list */ + while (*p != '\0' && isdigit (*p)) + p ++; + + if (*p == '\0') { /* single rank */ + low_num = MAX(0, atoi(range)); + high_num = MIN((ntasks-1), atoi(range)); + _set_range(low_num, high_num, exec_path); + } else if (*p == '-') { /* lower-upper */ + upper = ++ p; + while (isdigit (*p)) + p ++; + if (*p != '\0') { + error ("Invalid task range specification (%s) " + "ignored.", range); + continue; + } + low_num = MAX(0, atoi (range)); + high_num = MIN((ntasks-1), atoi(upper)); + _set_range(low_num, high_num, exec_path); + } else { + error ("Invalid task range specification (%s) ignored.", range); + } + } +} + +extern int +set_multi_name(int ntasks) +{ + FILE *config_fd; + char line[256]; + char *config_fname = NULL, 
*ranks, *exec_name, *p, *ptrptr; + int line_num = 0, i, rc = 0; + + for (i=0; i<ntasks; i++) { + MPIR_PROCDESC *tv; + tv = &MPIR_proctable[i]; + if (i == 0) + config_fname = tv->executable_name; + tv->executable_name = NULL; + } + + config_fd = fopen(config_fname, "r"); + if (config_fd == NULL) { + error("Unable to open configuration file %s", config_fname); + return -1; + } + while (fgets(line, sizeof(line), config_fd)) { + line_num ++; + if (strlen (line) >= (sizeof(line) - 1)) { + error ("Line %d of configuration file too long", + line_num); + rc = -1; + break; + } + p = line; + while (*p != '\0' && isspace (*p)) /* remove leading spaces */ + p ++; + + if (*p == '#' || *p == '\0') /* whole-line comment or blank */ + continue; + + ranks = strtok_r(p, " \t\n", &ptrptr); + exec_name = strtok_r(NULL, " \t\n", &ptrptr); + if (!ranks || !exec_name) { + error("Line %d is invalid", line_num); + rc = -1; + break; + } + _set_exec_names(ranks, exec_name, ntasks); + } + fclose(config_fd); /* single exit: close the file on success and on parse errors */ + return rc; +} diff --git a/src/srun/multi_prog.h b/src/srun/multi_prog.h new file mode 100644 index 0000000000000000000000000000000000000000..28bf865dfb55de6039b8ca37b00775f396bf0046 --- /dev/null +++ b/src/srun/multi_prog.h @@ -0,0 +1,37 @@ +/*****************************************************************************\ + * multi_prog.h - executing program according to task rank + * set MPIR_PROCDESC accordingly + ***************************************************************************** + * Produced at National University of Defense Technology (China) + * Written by Hongjia Cao <hjcao@nudt.edu.cn> + * and + * Lawrence Livermore National Laboratory (cf, DISCLAIMER). + * Written by Morris Jette <jette1@llnl.gov>. + * UCRL-CODE-217948. + * + * This file is part of SLURM, a resource management program. + * For details, see <http://www.llnl.gov/linux/slurm/>. 
+ * + * SLURM is free software; you can redistribute it and/or modify it under + * the terms of the GNU General Public License as published by the Free + * Software Foundation; either version 2 of the License, or (at your option) + * any later version. + * + * SLURM is distributed in the hope that it will be useful, but WITHOUT ANY + * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more + * details. + * + * You should have received a copy of the GNU General Public License along + * with SLURM; if not, write to the Free Software Foundation, Inc., + * 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA. +\*****************************************************************************/ +#ifndef _SRUN_MULTI_PROG_H +#define _SRUN_MULTI_PROG_H + +/* set global MPIR_PROCDESC executable names based upon multi-program + * configuration file */ +extern int set_multi_name(int ntasks); + +#endif + diff --git a/testsuite/expect/test1.54 b/testsuite/expect/test1.54 index 0d18e788460922a8105d6d3061618dab8acd475d..c545951152674a4123cb1e8336747d93d4df1a25 100755 --- a/testsuite/expect/test1.54 +++ b/testsuite/expect/test1.54 @@ -90,11 +90,64 @@ expect { wait } } - if {$matches != 4} { send_user "\nFAILURE: Did not get expected multi-program output\n" set exit_code 1 } +if {$exit_code != 0} { + exit $exit_code +} + +# +# Submit a slurm job that will execute different executables and check debug info +# +# Timeout is max_job_delay (to spawn task) + +# 60 (job time limit) + +# 60 (slurmctld time limit check poll interval) + +# KillWait +# +set timeout [expr $max_job_delay + 60 + 60 + 60] + +exec $bin_rm -f $file_in +set file [open $file_in "w"] +puts $file "# multi-program configuration file +1-2 /bin/hostname +0,3 /bin/date +" +close $file +exec $bin_chmod 700 $file_in + +set matches 0 +set timed_out 0 +spawn $srun -N1 -n4 --overcommit -l -t1 --multi-prog --debugger-test $file_in +expect { 
+ -re "executable:(/bin/)($alpha)" { + if {[string compare $expect_out(2,string) "date"] == 0} { + incr matches + } + if {[string compare $expect_out(2,string) "hostname"] == 0} { + incr matches + } + exp_continue + } + timeout { + send_user "\nFAILURE: srun not responding\n" + kill_srun + set exit_code 1 + exp_continue + } + eof { + wait + } +} +if {$timed_out == 1} { + send_user "\nEarly termination is expected, no worries.\n" +} +if {$matches != 4} { + send_user "\nFAILURE: did not generate full list of executables.\n" + set exit_code 1 +} + if {$exit_code == 0} { exec $bin_rm $file_in send_user "\nSUCCESS\n"