Skip to content
Snippets Groups Projects
Commit b9102c9b authored by Moe Jette's avatar Moe Jette
Browse files

Added support for debuggers in conjunction with srun --multi-prog option.

parent 3a902f5c
No related branches found
No related tags found
No related merge requests found
...@@ -13,6 +13,9 @@ documents those changes that are of interest to users and admins. ...@@ -13,6 +13,9 @@ documents those changes that are of interest to users and admins.
-- Moved safe_read/write to slurm_protocol_defs.h removing multiple copies. -- Moved safe_read/write to slurm_protocol_defs.h removing multiple copies.
-- Remove vestigial functions slurm_allocate_resources_and_run() and -- Remove vestigial functions slurm_allocate_resources_and_run() and
slurm_free_resource_allocation_and_run_response_msg(). slurm_free_resource_allocation_and_run_response_msg().
-- Added support for different executable files and arguments by task based
upon a configuration file. See srun's --multi-prog option (based upon
work by Hongjia Cao, National University of Defense Technology).
-- moved the way forward logic waited for fanout logic mostly eliminating -- moved the way forward logic waited for fanout logic mostly eliminating
problems with scalability issues. problems with scalability issues.
......
...@@ -89,7 +89,7 @@ Run a job with different programs and different arguments for ...@@ -89,7 +89,7 @@ Run a job with different programs and different arguments for
each task. In this case, the executable program specified is each task. In this case, the executable program specified is
actually a configuration file specifying the executable and actually a configuration file specifying the executable and
arguments for each task. See \fBMULTIPLE PROGRAM CONFIGURATION\fR arguments for each task. See \fBMULTIPLE PROGRAM CONFIGURATION\fR
below for details on the configuration file contents. below for details on the configuration file contents.
.TP .TP
\fB\-\-begin\fR=\fItime\fR \fB\-\-begin\fR=\fItime\fR
...@@ -1034,6 +1034,8 @@ Task rank ...@@ -1034,6 +1034,8 @@ Task rank
One or more task ranks to use this configuration. One or more task ranks to use this configuration.
Multiple values may be comma separated. Multiple values may be comma separated.
Ranges may be indicated with two numbers separated with a '\-'. Ranges may be indicated with two numbers separated with a '\-'.
To indicate all tasks, specify a rank of '*' (in which case you probably
should not be using this option).
.TP .TP
Executable Executable
The name of the program to execute. The name of the program to execute.
......
...@@ -29,6 +29,7 @@ srun_SOURCES = \ ...@@ -29,6 +29,7 @@ srun_SOURCES = \
allocate.h \ allocate.h \
core-format.c \ core-format.c \
core-format.h \ core-format.h \
multi_prog.c multi_prog.h \
srun.wrapper.c srun.wrapper.c
convenience_libs = \ convenience_libs = \
......
...@@ -65,6 +65,7 @@ ...@@ -65,6 +65,7 @@
#include "src/srun/sigstr.h" #include "src/srun/sigstr.h"
#include "src/srun/attach.h" #include "src/srun/attach.h"
#include "src/srun/allocate.h" #include "src/srun/allocate.h"
#include "src/srun/multi_prog.h"
#include "src/common/xstring.h" #include "src/common/xstring.h"
...@@ -193,6 +194,8 @@ static void _handle_update_mpir_proctable(int fd, srun_job_t *job) ...@@ -193,6 +194,8 @@ static void _handle_update_mpir_proctable(int fd, srun_job_t *job)
/* if all tasks are now accounted for, set the debug state and /* if all tasks are now accounted for, set the debug state and
call the Breakpoint */ call the Breakpoint */
if (tasks_recorded == job->step_layout->num_tasks) { if (tasks_recorded == job->step_layout->num_tasks) {
if (opt.multi_prog)
set_multi_name(ntasks);
MPIR_debug_state = MPIR_DEBUG_SPAWNED; MPIR_debug_state = MPIR_DEBUG_SPAWNED;
MPIR_Breakpoint(); MPIR_Breakpoint();
if (opt.debugger_test) if (opt.debugger_test)
......
/*****************************************************************************\
* multi_prog.c - executing program according to task rank
* set MPIR_PROCDESC accordingly
*
* NOTE: The logic could be eliminated if slurmstepd kept track of the
* executable name for each task and returned that information in a new
* launch response message (with multiple executable names).
*****************************************************************************
* Produced at National University of Defense Technology (China)
* Written by Hongjia Cao <hjcao@nudt.edu.cn>
* and
* Lawrence Livermore National Laboratory (cf, DISCLAIMER).
* Written by Morris Jette <jette1@llnl.gov>.
* UCRL-CODE-217948.
*
* This file is part of SLURM, a resource management program.
* For details, see <http://www.llnl.gov/linux/slurm/>.
*
* SLURM is free software; you can redistribute it and/or modify it under
* the terms of the GNU General Public License as published by the Free
* Software Foundation; either version 2 of the License, or (at your option)
* any later version.
*
* SLURM is distributed in the hope that it will be useful, but WITHOUT ANY
* WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
* FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
* details.
*
* You should have received a copy of the GNU General Public License along
* with SLURM; if not, write to the Free Software Foundation, Inc.,
* 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA.
\*****************************************************************************/
#include <stdio.h>
#include <ctype.h>
#include <string.h>
#include <stdlib.h>
#include <sys/stat.h>
#include <sys/types.h>
#include <unistd.h>
#include "src/common/log.h"
#include "src/common/xassert.h"
#include "src/common/xmalloc.h"
#include "src/common/xstring.h"
#include "src/srun/attach.h"
/* Given a program name, translate it to a fully qualified pathname
 * as needed based upon the PATH environment variable.
 *
 * fname IN - program name; only the text preceding the first white space
 *	character is used (and at most 255 characters of it)
 * RET pointer to a static buffer (overwritten by the next call, so the
 *	caller must copy the value if it needs to persist):
 *	- the name itself if it already starts with '/'
 *	- the first "<PATH entry>/<name>" that is an existing regular file
 *	- the unqualified name, after logging an error, if no PATH entry
 *	  holds the program
 *	or NULL, after logging an error, if PATH is not set. */
static char *
_build_path(char* fname)
{
	size_t i;
	int len;
	char *path_env = NULL, *dir, *ptrptr;
	static char file_name[256], file_path[256];	/* return values */
	struct stat buf;

	/* make copy of file name (end at white space); snprintf always
	 * NUL-terminates, so scanning up to the terminator is safe */
	snprintf(file_name, sizeof(file_name), "%s", fname);
	for (i = 0; file_name[i] != '\0'; i++) {
		/* cast: passing a negative char to isspace() is undefined */
		if (isspace((unsigned char) file_name[i])) {
			file_name[i] = '\0';
			break;
		}
	}

	/* check if already absolute path */
	if (file_name[0] == '/')
		return file_name;

	/* search for the file using PATH environment variable */
	dir = getenv("PATH");
	if (!dir) {
		error("No PATH environment variable");
		return NULL;
	}

	/* strtok_r modifies its input, so work on a private copy */
	path_env = xstrdup(dir);
	for (dir = strtok_r(path_env, ":", &ptrptr); dir != NULL;
	     dir = strtok_r(NULL, ":", &ptrptr)) {
		len = snprintf(file_path, sizeof(file_path), "%s/%s",
			       dir, file_name);
		if ((len < 0) || ((size_t) len >= sizeof(file_path)))
			continue;	/* truncated candidate can not match */
		/* a directory of the same name is not an executable */
		if ((stat(file_path, &buf) == 0) && S_ISREG(buf.st_mode))
			break;
	}
	if (dir == NULL) {	/* not found */
		error("Could not find executable %s", file_name);
		snprintf(file_path, sizeof(file_path), "%s", file_name);
	}
	xfree(path_env);
	return file_path;
}
/* Record exec_name as the executable of every task rank in the inclusive
 * range low_num..high_num of the global MPIR_proctable.
 *
 * A rank whose executable_name is already set keeps its current value and
 * an error is logged for it. Every other rank receives its own xstrdup()
 * copy of exec_name. No bounds checking is performed here; the caller
 * must pass ranks that are valid indexes into MPIR_proctable. */
static void
_set_range(int low_num, int high_num, char *exec_name)
{
	int rank = low_num;

	while (rank <= high_num) {
		MPIR_PROCDESC *entry = &MPIR_proctable[rank];

		if (!entry->executable_name) {
			entry->executable_name = xstrdup(exec_name);
		} else {
			error("duplicate configuration for task %d ignored",
			      rank);
		}
		rank++;
	}
}
/* Apply one configuration line to the global MPIR_proctable.
 *
 * ranks IN - task rank specification: "*" for all tasks, otherwise a comma
 *	separated list of single ranks ("3") and inclusive ranges ("1-2").
 *	The string is modified in place by strtok_r().
 * exec_name IN - program name as found in the configuration file
 * ntasks IN - number of tasks; ranks are clamped to 0..ntasks-1
 *
 * Malformed list elements are reported with error() and skipped; the
 * remaining elements are still processed. */
static void
_set_exec_names(char *ranks, char *exec_name, int ntasks)
{
	char *range, *p, *ptrptr, *exec_path, *upper;
	int low_num, high_num;

	/* NOTE(review): the wildcard case records exec_name as given, without
	 * the PATH lookup applied to explicit ranks - confirm this is intended */
	if (ranks[0] == '*' && ranks[1] == '\0') {
		low_num = 0;
		high_num = ntasks -1;
		_set_range(low_num, high_num, exec_name);
		return;
	}

	exec_path = _build_path(exec_name);
	if (exec_path == NULL)	/* no PATH available, keep name as given */
		exec_path = exec_name;

	for (range = strtok_r(ranks, ",", &ptrptr); range != NULL;
	     range = strtok_r(NULL, ",", &ptrptr)) {
		/* Scan the CURRENT token. Scanning from "ranks" would
		 * classify every token by the syntax of the first one. */
		p = range;
		while (isdigit((unsigned char) *p))
			p ++;
		if (p == range) {	/* no leading rank number */
			error ("Invalid task range specification (%s) ignored.",
			       range);
			continue;
		}

		if (*p == '\0') {		/* single rank */
			low_num  = MAX(0, atoi(range));
			high_num = MIN((ntasks-1), atoi(range));
			_set_range(low_num, high_num, exec_path);
		} else if (*p == '-') {		/* lower-upper */
			upper = ++ p;
			while (isdigit((unsigned char) *p))
				p ++;
			/* reject a missing upper bound or trailing junk */
			if ((p == upper) || (*p != '\0')) {
				error ("Invalid task range specification (%s) "
				       "ignored.", range);
				continue;
			}
			low_num  = MAX(0, atoi (range));
			high_num = MIN((ntasks-1), atoi(upper));
			_set_range(low_num, high_num, exec_path);
		} else {
			error ("Invalid task range specification (%s) ignored.",
			       range);
		}
	}
}
extern int
set_multi_name(int ntasks)
{
FILE *config_fd;
char line[256];
char *config_fname = NULL, *ranks, *exec_name, *p, *ptrptr;
int line_num = 0, i;
for (i=0; i<ntasks; i++) {
MPIR_PROCDESC *tv;
tv = &MPIR_proctable[i];
if (i == 0)
config_fname = tv->executable_name;
tv->executable_name = NULL;
}
config_fd = fopen(config_fname, "r");
if (config_fd == NULL) {
error("Unable to open configuration file %s", config_fname);
return -1;
}
while (fgets(line, sizeof(line), config_fd)) {
line_num ++;
if (strlen (line) >= (sizeof(line) - 1)) {
error ("Line %d of configuration file too long",
line_num);
return -1;
}
p = line;
while (*p != '\0' && isspace (*p)) /* remove leading spaces */
p ++;
if (*p == '#') /* only whole-line comments handled */
continue;
if (*p == '\0') /* blank line ignored */
continue;
ranks = strtok_r(p, " \t\n", &ptrptr);
exec_name = strtok_r(NULL, " \t\n", &ptrptr);
if (!ranks || !exec_name) {
error("Line %d is invalid", line_num);
return -1;
}
_set_exec_names(ranks, exec_name, ntasks);
}
return 0;
}
/*****************************************************************************\
* multi_prog.h - executing program according to task rank
* set MPIR_PROCDESC accordingly
*****************************************************************************
* Produced at National University of Defense Technology (China)
* Written by Hongjia Cao <hjcao@nudt.edu.cn>
* and
* Lawrence Livermore National Laboratory (cf, DISCLAIMER).
* Written by Morris Jette <jette1@llnl.gov>.
* UCRL-CODE-217948.
*
* This file is part of SLURM, a resource management program.
* For details, see <http://www.llnl.gov/linux/slurm/>.
*
* SLURM is free software; you can redistribute it and/or modify it under
* the terms of the GNU General Public License as published by the Free
* Software Foundation; either version 2 of the License, or (at your option)
* any later version.
*
* SLURM is distributed in the hope that it will be useful, but WITHOUT ANY
* WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
* FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
* details.
*
* You should have received a copy of the GNU General Public License along
* with SLURM; if not, write to the Free Software Foundation, Inc.,
* 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA.
\*****************************************************************************/
#ifndef _SRUN_MULTI_PROG_H
#define _SRUN_MULTI_PROG_H
/* set global MPIR_PROCDESC executable names based upon multi-program
 * configuration file
 *
 * The configuration file name is read from the executable_name of task 0
 * in MPIR_proctable; the executable_name of every task is then cleared
 * and set again from the contents of that file.
 *
 * ntasks IN - number of MPIR_proctable entries to process
 * RET 0 on success, -1 on failure (e.g. the configuration file can not be
 *	opened, or it contains an over-long or malformed line) */
extern int set_multi_name(int ntasks);
#endif /* _SRUN_MULTI_PROG_H */
...@@ -90,11 +90,64 @@ expect { ...@@ -90,11 +90,64 @@ expect {
wait wait
} }
} }
if {$matches != 4} { if {$matches != 4} {
send_user "\nFAILURE: Did not get expected multi-program output\n" send_user "\nFAILURE: Did not get expected multi-program output\n"
set exit_code 1 set exit_code 1
} }
if {$exit_code != 0} {
exit $exit_code
}
#
# Submit a slurm job that will execute different executables and check debug info
#
# Timeout is max_job_delay (to spawn task) +
# 60 (job time limit) +
# 60 (slurmctld time limit check poll interval) +
# KillWait
#
set timeout [expr $max_job_delay + 60 + 60 + 60]
exec $bin_rm -f $file_in
set file [open $file_in "w"]
puts $file "# multi-program configuration file
1-2 /bin/hostname
0,3 /bin/date
"
close $file
exec $bin_chmod 700 $file_in
# Run the multi-program job under --debugger-test and count the executable
# names reported for the tasks. The configuration file assigns /bin/hostname
# to tasks 1-2 and /bin/date to tasks 0 and 3, so four matches are expected.
set matches 0
set timed_out 0
spawn $srun -N1 -n4 --overcommit -l -t1 --multi-prog --debugger-test $file_in
expect {
	-re "executable:(/bin/)($alpha)" {
		# Count a task only when its executable is one of the two
		# configured programs; "string compare" returns 0 on equality.
		if {[string compare $expect_out(2,string) "date"] == 0} {
			incr matches
		}
		if {[string compare $expect_out(2,string) "hostname"] == 0} {
			incr matches
		}
		exp_continue
	}
	timeout {
		send_user "\nFAILURE: srun not responding\n"
		kill_srun
		set exit_code 1
		exp_continue
	}
	eof {
		wait
	}
}
# NOTE(review): timed_out is never set to 1 in the visible code, so this
# message can not currently be printed - confirm whether it is still wanted.
if {$timed_out == 1} {
	send_user "\nEarly termination is expected, no worries.\n"
}
if {$matches != 4} {
	send_user "\nFAILURE: did not generate full list of executables.\n"
	set exit_code 1
}
if {$exit_code == 0} { if {$exit_code == 0} {
exec $bin_rm $file_in exec $bin_rm $file_in
send_user "\nSUCCESS\n" send_user "\nSUCCESS\n"
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment