From a465ff2bdea308889b9ecc1f36a09be72f17ff99 Mon Sep 17 00:00:00 2001 From: Moe Jette <jette1@llnl.gov> Date: Wed, 22 Aug 2007 18:32:24 +0000 Subject: [PATCH] svn merge -r12023:12089 https://eris.llnl.gov/svn/slurm/branches/slurm-1.2 --- NEWS | 5 + configure | 3 +- configure.ac | 1 + contribs/torque/mpiexec.pl | 90 ++- contribs/torque/qsub.pl | 38 +- doc/html/checkpoint_plugins.shtml | 67 +-- doc/html/configurator.html.in | 1 + doc/html/documentation.shtml | 2 +- doc/html/maui.shtml | 12 +- doc/html/moab.shtml | 22 +- doc/man/man1/srun.1 | 7 + doc/man/man5/slurm.conf.5 | 5 +- doc/man/man5/wiki.conf.5 | 32 +- slurm.spec | 1 + src/api/pmi_server.c | 31 +- src/api/slurm_pmi.c | 35 +- src/common/forward.c | 13 +- src/common/forward.h | 2 - src/common/slurm_protocol_api.c | 28 +- src/common/slurm_protocol_defs.c | 17 +- src/common/slurm_protocol_defs.h | 9 + src/common/slurm_protocol_pack.c | 40 ++ src/plugins/checkpoint/Makefile.am | 2 +- src/plugins/checkpoint/Makefile.in | 2 +- src/plugins/checkpoint/aix/checkpoint_aix.c | 16 +- src/plugins/checkpoint/ompi/Makefile.am | 13 + src/plugins/checkpoint/ompi/Makefile.in | 549 ++++++++++++++++++ src/plugins/checkpoint/ompi/checkpoint_ompi.c | 309 ++++++++++ src/plugins/mpi/mpichmx/Makefile.in | 7 + src/plugins/sched/wiki2/hostlist.c | 78 +-- src/scontrol/update_job.c | 2 +- src/slurmctld/agent.c | 8 +- src/slurmctld/srun_comm.c | 38 +- src/slurmctld/srun_comm.h | 11 +- .../slurmstepd/step_terminate_monitor.c | 4 +- src/srun/msg.c | 94 +++ src/srun/srun.c | 3 +- src/srun/srun.h | 5 + src/srun/srun_job.c | 1 - testsuite/expect/test7.2.prog.c | 2 +- 40 files changed, 1454 insertions(+), 151 deletions(-) mode change 100644 => 100755 contribs/torque/qsub.pl create mode 100644 src/plugins/checkpoint/ompi/Makefile.am create mode 100644 src/plugins/checkpoint/ompi/Makefile.in create mode 100644 src/plugins/checkpoint/ompi/checkpoint_ompi.c diff --git a/NEWS b/NEWS index 358e6e13240..6e5bb4eebb8 100644 --- a/NEWS +++ b/NEWS @@ 
-43,6 +43,11 @@ documents those changes that are of interest to users and admins. be directly schedule by Slurm without Moab control -- Optimize load leveling for shared nodes (alloc.patch, contributed by Chris Holmes, HP). + -- Added PMI_TIME environment variable for user to control how PMI + communications are spread out in time. See "man srun" for details. + -- Added PMI timing information to srun debug mode to aid in tuning. + Use "srun -vv ..." to see the information. + -- Added checkpoint/ompi (OpenMPI) plugin (still under development). * Changes in SLURM 1.2.13 ========================= diff --git a/configure b/configure index a74ad03255e..e14e4b2134f 100755 --- a/configure +++ b/configure @@ -26612,7 +26612,7 @@ _ACEOF -ac_config_files="$ac_config_files Makefile config.xml auxdir/Makefile contribs/Makefile contribs/perlapi/Makefile contribs/perlapi/libslurm-perl/Makefile.PL contribs/torque/Makefile src/Makefile src/api/Makefile src/common/Makefile src/sacct/Makefile src/salloc/Makefile src/sbatch/Makefile src/sattach/Makefile src/srun/Makefile src/slaunch/Makefile src/slurmd/Makefile src/slurmd/slurmd/Makefile src/slurmd/slurmstepd/Makefile src/slurmctld/Makefile src/sbcast/Makefile src/scontrol/Makefile src/scancel/Makefile src/squeue/Makefile src/sinfo/Makefile src/smap/Makefile src/strigger/Makefile src/sview/Makefile src/plugins/Makefile src/plugins/auth/Makefile src/plugins/auth/authd/Makefile src/plugins/auth/munge/Makefile src/plugins/auth/none/Makefile src/plugins/checkpoint/Makefile src/plugins/checkpoint/aix/Makefile src/plugins/checkpoint/none/Makefile src/plugins/crypto/Makefile src/plugins/crypto/munge/Makefile src/plugins/crypto/openssl/Makefile src/plugins/jobacct/Makefile src/plugins/jobacct/linux/Makefile src/plugins/jobacct/aix/Makefile src/plugins/jobacct/none/Makefile src/plugins/jobcomp/Makefile src/plugins/jobcomp/filetxt/Makefile src/plugins/jobcomp/none/Makefile src/plugins/jobcomp/script/Makefile 
src/plugins/jobcomp/database/Makefile src/plugins/proctrack/Makefile src/plugins/proctrack/aix/Makefile src/plugins/proctrack/pgid/Makefile src/plugins/proctrack/linuxproc/Makefile src/plugins/proctrack/rms/Makefile src/plugins/proctrack/sgi_job/Makefile src/plugins/sched/Makefile src/plugins/sched/backfill/Makefile src/plugins/sched/builtin/Makefile src/plugins/sched/gang/Makefile src/plugins/sched/hold/Makefile src/plugins/sched/wiki/Makefile src/plugins/sched/wiki2/Makefile src/plugins/select/Makefile src/plugins/select/bluegene/Makefile src/plugins/select/bluegene/block_allocator/Makefile src/plugins/select/bluegene/plugin/Makefile src/plugins/select/linear/Makefile src/plugins/select/cons_res/Makefile src/plugins/switch/Makefile src/plugins/switch/elan/Makefile src/plugins/switch/none/Makefile src/plugins/switch/federation/Makefile src/plugins/mpi/Makefile src/plugins/mpi/mpich1_p4/Makefile src/plugins/mpi/mpich1_shmem/Makefile src/plugins/mpi/mpichgm/Makefile src/plugins/mpi/mpichmx/Makefile src/plugins/mpi/mvapich/Makefile src/plugins/mpi/lam/Makefile src/plugins/mpi/none/Makefile src/plugins/mpi/openmpi/Makefile src/plugins/task/Makefile src/plugins/task/affinity/Makefile src/plugins/task/none/Makefile src/plugins/database/Makefile src/plugins/database/flatfile/Makefile src/plugins/database/mysql/Makefile src/plugins/database/pgsql/Makefile doc/Makefile doc/man/Makefile doc/html/Makefile doc/html/configurator.html testsuite/Makefile testsuite/expect/Makefile testsuite/slurm_unit/Makefile testsuite/slurm_unit/common/Makefile testsuite/slurm_unit/slurmctld/Makefile testsuite/slurm_unit/slurmd/Makefile testsuite/slurm_unit/api/Makefile testsuite/slurm_unit/api/manual/Makefile" +ac_config_files="$ac_config_files Makefile config.xml auxdir/Makefile contribs/Makefile contribs/perlapi/Makefile contribs/perlapi/libslurm-perl/Makefile.PL contribs/torque/Makefile src/Makefile src/api/Makefile src/common/Makefile src/sacct/Makefile src/salloc/Makefile 
src/sbatch/Makefile src/sattach/Makefile src/srun/Makefile src/slaunch/Makefile src/slurmd/Makefile src/slurmd/slurmd/Makefile src/slurmd/slurmstepd/Makefile src/slurmctld/Makefile src/sbcast/Makefile src/scontrol/Makefile src/scancel/Makefile src/squeue/Makefile src/sinfo/Makefile src/smap/Makefile src/strigger/Makefile src/sview/Makefile src/plugins/Makefile src/plugins/auth/Makefile src/plugins/auth/authd/Makefile src/plugins/auth/munge/Makefile src/plugins/auth/none/Makefile src/plugins/checkpoint/Makefile src/plugins/checkpoint/aix/Makefile src/plugins/checkpoint/none/Makefile src/plugins/checkpoint/ompi/Makefile src/plugins/crypto/Makefile src/plugins/crypto/munge/Makefile src/plugins/crypto/openssl/Makefile src/plugins/jobacct/Makefile src/plugins/jobacct/linux/Makefile src/plugins/jobacct/aix/Makefile src/plugins/jobacct/none/Makefile src/plugins/jobcomp/Makefile src/plugins/jobcomp/filetxt/Makefile src/plugins/jobcomp/none/Makefile src/plugins/jobcomp/script/Makefile src/plugins/jobcomp/database/Makefile src/plugins/proctrack/Makefile src/plugins/proctrack/aix/Makefile src/plugins/proctrack/pgid/Makefile src/plugins/proctrack/linuxproc/Makefile src/plugins/proctrack/rms/Makefile src/plugins/proctrack/sgi_job/Makefile src/plugins/sched/Makefile src/plugins/sched/backfill/Makefile src/plugins/sched/builtin/Makefile src/plugins/sched/gang/Makefile src/plugins/sched/hold/Makefile src/plugins/sched/wiki/Makefile src/plugins/sched/wiki2/Makefile src/plugins/select/Makefile src/plugins/select/bluegene/Makefile src/plugins/select/bluegene/block_allocator/Makefile src/plugins/select/bluegene/plugin/Makefile src/plugins/select/linear/Makefile src/plugins/select/cons_res/Makefile src/plugins/switch/Makefile src/plugins/switch/elan/Makefile src/plugins/switch/none/Makefile src/plugins/switch/federation/Makefile src/plugins/mpi/Makefile src/plugins/mpi/mpich1_p4/Makefile src/plugins/mpi/mpich1_shmem/Makefile src/plugins/mpi/mpichgm/Makefile 
src/plugins/mpi/mpichmx/Makefile src/plugins/mpi/mvapich/Makefile src/plugins/mpi/lam/Makefile src/plugins/mpi/none/Makefile src/plugins/mpi/openmpi/Makefile src/plugins/task/Makefile src/plugins/task/affinity/Makefile src/plugins/task/none/Makefile src/plugins/database/Makefile src/plugins/database/flatfile/Makefile src/plugins/database/mysql/Makefile src/plugins/database/pgsql/Makefile doc/Makefile doc/man/Makefile doc/html/Makefile doc/html/configurator.html testsuite/Makefile testsuite/expect/Makefile testsuite/slurm_unit/Makefile testsuite/slurm_unit/common/Makefile testsuite/slurm_unit/slurmctld/Makefile testsuite/slurm_unit/slurmd/Makefile testsuite/slurm_unit/api/Makefile testsuite/slurm_unit/api/manual/Makefile" cat >confcache <<\_ACEOF @@ -27364,6 +27364,7 @@ do "src/plugins/checkpoint/Makefile") CONFIG_FILES="$CONFIG_FILES src/plugins/checkpoint/Makefile" ;; "src/plugins/checkpoint/aix/Makefile") CONFIG_FILES="$CONFIG_FILES src/plugins/checkpoint/aix/Makefile" ;; "src/plugins/checkpoint/none/Makefile") CONFIG_FILES="$CONFIG_FILES src/plugins/checkpoint/none/Makefile" ;; + "src/plugins/checkpoint/ompi/Makefile") CONFIG_FILES="$CONFIG_FILES src/plugins/checkpoint/ompi/Makefile" ;; "src/plugins/crypto/Makefile") CONFIG_FILES="$CONFIG_FILES src/plugins/crypto/Makefile" ;; "src/plugins/crypto/munge/Makefile") CONFIG_FILES="$CONFIG_FILES src/plugins/crypto/munge/Makefile" ;; "src/plugins/crypto/openssl/Makefile") CONFIG_FILES="$CONFIG_FILES src/plugins/crypto/openssl/Makefile" ;; diff --git a/configure.ac b/configure.ac index cecb74a6dd2..5965834bc05 100644 --- a/configure.ac +++ b/configure.ac @@ -279,6 +279,7 @@ AC_CONFIG_FILES([Makefile src/plugins/checkpoint/Makefile src/plugins/checkpoint/aix/Makefile src/plugins/checkpoint/none/Makefile + src/plugins/checkpoint/ompi/Makefile src/plugins/crypto/Makefile src/plugins/crypto/munge/Makefile src/plugins/crypto/openssl/Makefile diff --git a/contribs/torque/mpiexec.pl b/contribs/torque/mpiexec.pl index 
d9b7e31cdd9..f468966e742 100755 --- a/contribs/torque/mpiexec.pl +++ b/contribs/torque/mpiexec.pl @@ -42,7 +42,7 @@ use strict; use FindBin; -use Getopt::Long 2.24 qw(:config no_ignore_case); +use Getopt::Long 2.24 qw(:config no_ignore_case require_order); use lib "${FindBin::Bin}/../lib/perl"; use autouse 'Pod::Usage' => qw(pod2usage); use Slurm ':all'; @@ -50,10 +50,57 @@ use Switch; my $srun = "${FindBin::Bin}/srun"; -my ($nprocs, $hostname, $help, $man); +my ($nprocs, $hostname, $verbose, $nostdin, $allstdin, $nostdout, $pernode, + $perif, $no_shem, $gige, $kill_it, $tv, $config_file, $help, $man); + +sub get_new_config() { + + my @file_parts = split(/\//, $config_file); + my $new_config = "/tmp/$file_parts[$#file_parts].slurm"; + my $task_cnt = 0; + my $end_cnt = 0; + + open OLD_FILE, "$config_file" or + die "$config_file doesn't exsist!"; + open FILE, ">$new_config" or + die "Can't open $new_config"; + + foreach my $line (<OLD_FILE>) { + my @parts = split(/\:/, $line); + if(!$parts[0] || !$parts[1] + || ($parts[0] eq "") + || ($parts[1] eq "") + || ($parts[0] =~ '#')) { + next; + } elsif ($parts[0] =~ '\-n *(\d)') { + $end_cnt = $task_cnt+$1-1; + print FILE "$task_cnt-$end_cnt\t$parts[1]"; + $task_cnt = $end_cnt+1; + } else { + print "We don't have support for hostname task layout in a config file right now.\nPlease use srun with the -m arbitrary mode to layout tasks on specific nodes.\n"; + } + } + + close FILE; + close OLD_FILE; + + return ($new_config, $task_cnt); +} + GetOptions('n=i' => \$nprocs, - 'host=s' => \$hostname, + 'host=s' => \$hostname, + 'verbose+' => \$verbose, + 'nostdin' => \$nostdin, + 'allstdin' => \$allstdin, + 'nostdout' => \$nostdout, + 'pernode' => \$pernode, + 'perif' => \$perif, # n/a + 'no-shmem' => \$no_shem, # n/a + 'gige' => \$gige, # n/a + 'kill' => \$kill_it, # n/a + 'tv|totalview' => \$tv, # n/a + 'config=s' => \$config_file, 'help|?' 
=> \$help, 'man' => \$man ) or pod2usage(2); @@ -78,14 +125,43 @@ if ($man) { # Use sole remaining argument as jobIds my $script; if ($ARGV[0]) { - $script = $ARGV[0]; -} else { + foreach (@ARGV) { + $script .= "$_ "; + } +} elsif(!$config_file) { pod2usage(2); } +my $new_config; + + my $command = "$srun"; -$command .= " -n$nprocs" if $nprocs; -$command .= " -w$hostname" if $hostname; +# write stdout and err to files instead of stdout +$command .= " -o job.o\%j -e job.e\%j" if $nostdout; +$command .= " -inone" if $nostdin; +$command .= " -i0" if !$allstdin; #default only send stdin to first node +$command .= " -n$nprocs" if $nprocs; # number of tasks +$command .= " -w$hostname" if $hostname; # Hostlist provided +if($verbose) { + $command .= " -"; # verbose + for(my $i=0; $i<$verbose; $i++) { + $command .= "v"; + } +} + +if($config_file) { + ($new_config, my $new_nprocs) = get_new_config(); + $command .= " -n$new_nprocs" if !$nprocs; + $command .= " --multi-prog $new_config"; +} else { + $command .= " $script"; +} +#print "$command\n"; system($command); + +system("rm -f $new_config") if($new_config); + + + diff --git a/contribs/torque/qsub.pl b/contribs/torque/qsub.pl old mode 100644 new mode 100755 index 9458f0850f4..634a29352a2 --- a/contribs/torque/qsub.pl +++ b/contribs/torque/qsub.pl @@ -42,7 +42,7 @@ use strict; use FindBin; -use Getopt::Long 2.24 qw(:config no_ignore_case); +use Getopt::Long 2.24 qw(:config no_ignore_case require_order); use lib "${FindBin::Bin}/../lib/perl"; use autouse 'Pod::Usage' => qw(pod2usage); use Slurm ':all'; @@ -62,7 +62,7 @@ my ($start_time, $mail_user_list, $job_name, $out_path, -# $priority, + $priority, $destination, # $rerunable, # $script_path, @@ -92,7 +92,7 @@ GetOptions('a=s' => \$start_time, 'M=s' => \$mail_user_list, 'N=s' => \$job_name, 'o=s' => \$out_path, -# 'p=i' => \$priority, + 'p=i' => \$priority, 'q=s' => \$destination, # 'r=s' => \$rerunable, # 'S=s' => \$script_path, @@ -126,7 +126,9 @@ if ($man) { # Use sole 
remaining argument as jobIds my $script; if ($ARGV[0]) { - $script = $ARGV[0]; + foreach (@ARGV) { + $script .= "$_ "; + } } else { pod2usage(2); } @@ -191,6 +193,7 @@ if($mail_options) { } $command .= " --mail-user=$mail_user_list" if $mail_user_list; $command .= " -J $job_name" if $job_name; +$command .= " --nice=$priority" if $priority; $command .= " -p $destination" if $destination; $command .= " -C $additional_attributes" if $additional_attributes; @@ -228,6 +231,10 @@ sub parse_resource_list { if($opt{cput}) { $opt{cput} = get_minutes($opt{cput}); } + + if($opt{mem}) { + $opt{mem} = convert_mb_format($opt{mem}); + } return \%opt; } @@ -276,7 +283,7 @@ sub parse_node_opts { if($opt{task_cnt}) { $opt{task_cnt} *= $opt{node_cnt}; } - + return \%opt; } @@ -303,10 +310,27 @@ sub get_minutes { } sub convert_mb_format { - my ($amount) = @_; - + my ($value) = @_; + my ($amount, $suffix) = $value =~ /(\d+)($|[KMGT])/i; return if !$amount; + $suffix = lc($suffix); + + if (!$suffix) { + $amount /= 1048576; + } elsif ($suffix eq "k") { + $amount /= 1024; + } elsif ($suffix eq "m") { + #do nothing this is what we want. + } elsif ($suffix eq "g") { + $amount *= 1024; + } elsif ($suffix eq "t") { + $amount *= 1048576; + } else { + print "don't know what to do with suffix $suffix\n"; + return; + } + return $amount; } ############################################################################## diff --git a/doc/html/checkpoint_plugins.shtml b/doc/html/checkpoint_plugins.shtml index 86518ab9b1d..286b631b613 100644 --- a/doc/html/checkpoint_plugins.shtml +++ b/doc/html/checkpoint_plugins.shtml @@ -16,8 +16,9 @@ The major type must be "checkpoint." The minor type can be any recogni abbreviation for the type of checkpoint mechanism. 
We recommend, for example:</p> <ul> -<li><b>none</b>—No job checkpoint.</li> <li><b>aix</b>—AIX system checkpoint.</li> +<li><b>none</b>—No job checkpoint.</li> +<li><b>ompi</b>—OpenMPI checkpoint (requires OpenMPI version 1.3 or higher).</li> </ul></p> <p>The <span class="commandline">plugin_name</span> and @@ -56,8 +57,8 @@ be stubbed.</p> <p class="commandline">int slurm_ckpt_alloc_job (check_jobinfo_t *jobinfo);</p> <p style="margin-left:.2in"><b>Description</b>: Allocate storage for job-step specific checkpoint data.</p> -<p style="margin-left:.2in"><b>Argument</b>:<span class="commandline"> jobinfo</span> - (output) returns pointer to the allocated storage.</p> +<p style="margin-left:.2in"><b>Argument</b>: +<b>jobinfo</b> (output) returns pointer to the allocated storage.</p> <p style="margin-left:.2in"><b>Returns</b>: SLURM_SUCCESS if successful. On failure, the plugin should return SLURM_ERROR and set the errno to an appropriate value to indicate the reason for failure.</p> @@ -65,8 +66,8 @@ to indicate the reason for failure.</p> <p class="commandline">int slurm_ckpt_free_job (check_jobinfo_t jobinfo);</p> <p style="margin-left:.2in"><b>Description</b>: Release storage for job-step specific checkpoint data that was previously allocated by slurm_ckpt_alloc_job.</p> -<p style="margin-left:.2in"><b>Argument</b>:<span class="commandline"> jobinfo</span> - (input) pointer to the previously allocated storage.</p> +<p style="margin-left:.2in"><b>Argument</b>: +<b>jobinfo</b> (input) pointer to the previously allocated storage.</p> <p style="margin-left:.2in"><b>Returns</b>: SLURM_SUCCESS if successful. 
On failure, the plugin should return SLURM_ERROR and set the errno to an appropriate value to indicate the reason for failure.</p> @@ -75,10 +76,8 @@ to indicate the reason for failure.</p> <p style="margin-left:.2in"><b>Description</b>: Store job-step specific checkpoint data into a buffer.</p> <p style="margin-left:.2in"><b>Arguments</b>:<br> -<span class="commandline"> jobinfo</span> - (input) pointer to the previously allocated storage.<br> -<span class="commandline">Buf</span> (input/output) buffer to which -jobinfo has been appended.</p> +<b>jobinfo</b> (input) pointer to the previously allocated storage.<br> +<b>Buf</b> (input/output) buffer to which jobinfo has been appended.</p> <p style="margin-left:.2in"><b>Returns</b>: SLURM_SUCCESS if successful. On failure, the plugin should return SLURM_ERROR and set the errno to an appropriate value to indicate the reason for failure.</p> @@ -87,10 +86,8 @@ to indicate the reason for failure.</p> <p style="margin-left:.2in"><b>Description</b>: Retrieve job-step specific checkpoint data from a buffer.</p> <p style="margin-left:.2in"><b>Arguments</b>:</br> -<span class="commandline"> jobinfo</span> - (output) pointer to the previously allocated storage.<br> -<span class="commandline">Buf</span> (input/output) buffer from which -jobinfo has been removed.</p> +<b>jobinfo</b> (output) pointer to the previously allocated storage.<br> +<b>Buf</b> (input/output) buffer from which jobinfo has been removed.</p> <p style="margin-left:.2in"><b>Returns</b>: SLURM_SUCCESS if successful. On failure, the plugin should return SLURM_ERROR and set the errno to an appropriate value to indicate the reason for failure.</p> @@ -101,26 +98,25 @@ uint32_t *error_code, char **error_msg );</p> <p style="margin-left:.2in"><b>Description</b>: Perform some checkpoint operation on a specific job step.</p> <p style="margin-left:.2in"><b>Arguments</b>:<br> -<span class="commandline"> op</span> - (input) specifies the operation to be performed. 
Currently supported -operations include CHECK_ABLE (is job step currently able to be checkpointed), +<b>op</b> (input) specifies the operation to be performed. +Currently supported operations include +CHECK_ABLE (is job step currently able to be checkpointed), CHECK_DISABLE (disable checkpoints for this job step), CHECK_ENABLE (enable checkpoints for this job step), CHECK_CREATE (create a checkpoint for this job step and continue its execution), CHECK_VACATE (create a checkpoint for this job step and terminate it), CHECK_RESTART (restart this previously checkpointed job step), and CHECK_ERROR (return checkpoint-specific error information for this job step).<br> -<span class="commandline">data</span> (input) operation-specific -data.</br> -<span class="commandline">step_ptr</span> (input/output) identifies -the job step to be operated upon.</br> -<span class="commandline">event_time</span> (output) identifies -the time of a checkpoint or restart operation.</br> -<span class="commandline">error_code</span> (output) returns -checkpoint-specific error code associated with an operation.</br> -<span class="commandline">error_msg</span> (output) identifies -checkpoint-specific error message associated with an operation.</p> -<p style="margin-left:.2in"><b>Returns</b>: SLURM_SUCCESS if successful. On failure, +<b>data</b> (input) operation-specific data.</br> +<b>step_ptr</b> (input/output) identifies the job step to be operated upon.</br> +<b>event_time</b> (output) identifies the time of a checkpoint or restart +operation.</br> +<b>error_code</b> (output) returns checkpoint-specific error code +associated with an operation.</br> +<b>error_msg</b> (output) identifies checkpoint-specific error message +associated with an operation.</p> +<p style="margin-left:.2in"><b>Returns</b>: <br> +SLURM_SUCCESS if successful. 
On failure, the plugin should return SLURM_ERROR and set the error_code and error_msg to an appropriate value to indicate the reason for failure.</p> @@ -128,14 +124,13 @@ appropriate value to indicate the reason for failure.</p> uint32_t error_code, char *error_msg );</p> <p style="margin-left:.2in"><b>Description</b>: Note the completion of a checkpoint operation.</p> <p style="margin-left:.2in"><b>Arguments</b>:<br> -<span class="commandline">step_ptr</span> (input/output) identifies -the job step to be operated upon.</br> -<span class="commandline">event_time</span> (input) identifies -the time that the checkpoint operation began.</br> -<span class="commandline">error_code</span> (input) -checkpoint-specific error code associated with an operation.</br> -<span class="commandline">error_msg</span> (input) -checkpoint-specific error message associated with an operation.</p> +<b>step_ptr</b> (input/output) identifies the job step to be operated upon.</br> +<b>event_time</b> (input) identifies the time that the checkpoint operation +began.</br> +<b>error_code</b> (input) checkpoint-specific error code associated +with an operation.</br> +<b>error_msg</b> (input) checkpoint-specific error message associated +with an operation.</p> <p style="margin-left:.2in"><b>Returns</b>: SLURM_SUCCESS if successful. 
On failure, the plugin should return SLURM_ERROR and set the error_code and error_msg to an appropriate value to indicate the reason for failure.</p> @@ -148,6 +143,6 @@ A checkpoint plugin conveys its ability to implement a particular API version using the mechanism outlined for SLURM plugins.</p> <p class="footer"><a href="#top">top</a></p> -<p style="text-align:center;">Last modified 24 July 2007</p> +<p style="text-align:center;">Last modified 21 August 2007</p> <!--#include virtual="footer.txt"--> diff --git a/doc/html/configurator.html.in b/doc/html/configurator.html.in index 62ec3b13657..7dd17e55263 100644 --- a/doc/html/configurator.html.in +++ b/doc/html/configurator.html.in @@ -135,6 +135,7 @@ function displayfile() "ProctrackType=proctrack/" + get_radio_value(document.config.proctrack_type) + "<br>" + "#PluginDir= <br>" + "CacheGroups=" + get_radio_value(document.config.cache_groups) + "<br>" + + "#CheckpointType=checkpoint/none <br>" + "#FirstJobId=1 <br>" + "ReturnToService=" + get_radio_value(document.config.return_to_service) + "<br>" + "#MaxJobCount=2000 <br>" + diff --git a/doc/html/documentation.shtml b/doc/html/documentation.shtml index 41d9a0a4329..a6ca78cd0bd 100644 --- a/doc/html/documentation.shtml +++ b/doc/html/documentation.shtml @@ -40,10 +40,10 @@ Jobs throuh LSF</a></li> <li><a href="api.shtml">Application Programmer Interface (API) Guide</a></li> <li><a href="plugins.shtml">Plugin Programmer Guide</a></li> <li><a href="authplugins.shtml">Authentication Plugin Programmer Guide</a></li> -<li><a href="checkpoint_plugins.shtml">Job Checkpoint Plugin Programmer Guild</a></li> <li><a href="crypto_plugins.shtml">Cryptographic Plugin Programmer Guild</a></li> <li><a href="databaseplugins.shtml">Database Plugin Programmer Guide</a></li> <li><a href="jobacctplugins.shtml">Job Accounting Plugin Programmer Guide</a></li> +<li><a href="checkpoint_plugins.shtml">Job Checkpoint Plugin Programmer Guide</a></li> <li><a href="jobcompplugins.shtml">Job 
Completion Logging Plugin Programmer Guide</a></li> <li><a href="mpiplugins.shtml">MPI Plugin Programmer Guide</a></li> <li><a href="proctrack_plugins.shtml">Process Tracking Plugin Programmer Guide</a></li> diff --git a/doc/html/maui.shtml b/doc/html/maui.shtml index b362101d439..3331360df00 100644 --- a/doc/html/maui.shtml +++ b/doc/html/maui.shtml @@ -113,7 +113,15 @@ is <b>AuthKey</b>, which should match the key used to configure Maui at build time. Note that SLURM's wiki plugin does not include a mechanism to submit new jobs, so even without this key nobody could -run jobs as another user.</p> +run jobs as another user. +Note that Maui's use of an authentication key with SLURM +is still under development. +If that support is not in place and SLURM is configured +with an <b>AuthKey</b> then communications between Maui +and SLURM will fail and the SlurmctldLog file will contain +errors of this sort: <i>error: wiki: request lacks AUTH=</i>. +If you see this error, remove <b>AuthKey</b> from SLURM's +configuration.</p> <p>Here is a sample <i>wiki.conf</i> file</p> <pre> @@ -127,6 +135,6 @@ AuthKey=42 <p class="footer"><a href="#top">top</a></p> -<p style="text-align:center;">Last modified 18 December 2006</p> +<p style="text-align:center;">Last modified 21 August 2007</p> <!--#include virtual="footer.txt"--> diff --git a/doc/html/moab.shtml b/doc/html/moab.shtml index 41248a5ea7c..f0d761161f1 100644 --- a/doc/html/moab.shtml +++ b/doc/html/moab.shtml @@ -76,7 +76,15 @@ in the <i>moab.cnf</i> file.</p> <p><b>EHost</b> is the event notification host for Moab. This identifies the computer on which the Moab daemons -executes which should be notified of events.</p> +executes which should be notified of events. +By default EHost will be identical in value to the +ControlAddr configured in slurm.conf.</p> + +<p><b>EHostBackup</b> is the event notification backup host for Moab. +Names the computer on which the backup Moab server executes. 
+It is used in establishing a communications path for event notification. +By default EHostBackup will be identical in value to the +BackupAddr configured in slurm.conf.</p> <p><b>ExcludePartitions</b> is used to identify partitions whose jobs are to be scheduled directly by SLURM rather @@ -90,9 +98,17 @@ Moab will account for and report the jobs, but their initiation will be outside of Moab's control. Note that Moab controls for resource reservation, fair share scheduling, etc. will not apply to the initiation of these jobs. -If more than one partition is to be scheduled directly be +If more than one partition is to be scheduled directly by Slurm, use a comma separator between their names.</p> +<p><b>HostFormat</b> controls the format of job task lists built +by Slurm and reported to Moab. +The default value is "0", for which each host name is listed +individually, once per processor (e.g. "tux0:tux0:tux1:tux1:..."). +A value of "1" uses Slurm hostlist expressions with processor +counts (e.g. "tux[0-16]*2"). +This is currently experimental.</p> + <p><b>JobAggregationTime</b> is used to avoid notifying Moab of large numbers of events occurring about the same time. If an event occurs within this number of seconds since Moab was @@ -152,6 +168,6 @@ CLIENTCFG[RM:slurm] KEY=123456789 <p class="footer"><a href="#top">top</a></p> -<p style="text-align:center;">Last modified 16 August 2007</p> +<p style="text-align:center;">Last modified 17 August 2007</p> <!--#include virtual="footer.txt"--> diff --git a/doc/man/man1/srun.1 b/doc/man/man1/srun.1 index 450a81c653c..adf326979dd 100644 --- a/doc/man/man1/srun.1 +++ b/doc/man/man1/srun.1 @@ -1034,6 +1034,13 @@ These environment variables, along with their corresponding options, are listed below. Note: Command line options will always override these settings. 
.TP 22 +\fBPMI_TIME\fR +This is used exclusively with PMI (MPICH2 and MVAPICH2) and +controls how much the communications from the tasks to the +srun are spread out in time in order to avoid overwhelming the +srun command with work. The default value is 500 (microseconds) +per task. On relatively slow processors, higher values may be required. +.TP \fBSLURM_CONF\fR The location of the SLURM configuration file. .TP diff --git a/doc/man/man5/slurm.conf.5 b/doc/man/man5/slurm.conf.5 index 20c05eb0334..19c43787d52 100644 --- a/doc/man/man5/slurm.conf.5 +++ b/doc/man/man5/slurm.conf.5 @@ -80,7 +80,10 @@ The default value is 0 to disable caching group data. Define the system\-initiated checkpoint method to be used for user jobs. The slurmctld daemon must be restarted for a change in \fBCheckpointType\fR to take effect. -Acceptable values at present include "checkpoint/none" and "checkpoint/aix" +Acceptable values at present include +"checkpoint/aix" (only on AIX systems), +"checkpoint/ompi" (requires OpenMPI version 1.3 or higher), and +"checkpoint/none". (only on AIX systems). The default value is "checkpoint/none". diff --git a/doc/man/man5/wiki.conf.5 b/doc/man/man5/wiki.conf.5 index 4be5273df41..5b6fce9dd6b 100644 --- a/doc/man/man5/wiki.conf.5 +++ b/doc/man/man5/wiki.conf.5 @@ -59,8 +59,24 @@ Moab will account for and report the jobs, but their initiation will be outside of Moab's control. Note that Moab controls for resource reservation, fair share scheduling, etc. will not apply to the initiation of these jobs. -If more than one partition is to be scheduled directly be +If more than one partition is to be scheduled directly by Slurm, use a comma separator between their names. +Not applicable to wiki plugin, only the wiki2 plugin. + +.TP +\fBHostFormat\fR +Controls the format of host lists exchanged between SLURM and Moab. +The default value is "0". +.RS +.TP +\fB0\fR +No data compression. Each host name is listed individually. 
+.TP +\fB1\fR +SLURM hostlist expressions are exchanged with task counts +(e.g. "tux[0\-16]*2"). +This is currently experimental. +.RE .TP \fBJobAggregationTime\fR @@ -88,20 +104,6 @@ Job permitted to run directly under SLURM's control Hold all incomming jobs until Moab or Maui tell them to run .RE -.TP -\fBHostFormat\fR -Controls the format of host lists exchanged between SLURM and Moab. -The default value is "0". -.RS -.TP -\fB0\fR -No data compression. Each host name is listed individually. -.TP -\fB1\fR -SLURM hostlist expressions are exchanged (e.g. "tux[0\-16]"). -This is currently experimental. -.RE - .SH "EXAMPLE" .LP .br diff --git a/slurm.spec b/slurm.spec index 03fe44831b3..a800c8923bf 100644 --- a/slurm.spec +++ b/slurm.spec @@ -300,6 +300,7 @@ rm -rf $RPM_BUILD_ROOT %dir %{_sysconfdir} %dir %{_libdir}/slurm %{_libdir}/slurm/checkpoint_none.so +%{_libdir}/slurm/checkpoint_ompi.so %{_libdir}/slurm/database_flatfile.so %{_libdir}/slurm/database_mysql.so %{_libdir}/slurm/database_pgsql.so diff --git a/src/api/pmi_server.c b/src/api/pmi_server.c index 91270fbe6e2..94c4f9a7880 100644 --- a/src/api/pmi_server.c +++ b/src/api/pmi_server.c @@ -36,6 +36,7 @@ #include "src/common/macros.h" #include "src/common/slurm_protocol_api.h" #include "src/common/slurm_protocol_defs.h" +#include "src/common/timers.h" #include "src/common/xsignal.h" #include "src/common/xstring.h" #include "src/common/xmalloc.h" @@ -48,6 +49,12 @@ static int kvs_comm_cnt = 0; static int kvs_updated = 0; static struct kvs_comm **kvs_comm_ptr = NULL; +/* Track time to process kvs put requests + * This can be used to tune PMI_TIME environment variable */ +static int min_time_kvs_put = 1000000; +static int max_time_kvs_put = 0; +static int tot_time_kvs_put = 0; + struct barrier_resp { uint16_t port; char *hostname; @@ -95,6 +102,15 @@ static void _kvs_xmit_tasks(void) #if _DEBUG info("All tasks at barrier, transmit KVS keypairs now"); #endif + + /* Target PMI_TIME should be about ave processing 
time */ + debug("kvs_put processing time min=%d, max=%d ave=%d (usec)", + min_time_kvs_put, max_time_kvs_put, + (tot_time_kvs_put / barrier_cnt)); + min_time_kvs_put = 1000000; + max_time_kvs_put = 0; + tot_time_kvs_put = 0; + /* reset barrier info */ args = xmalloc(sizeof(struct agent_arg)); args->barrier_xmit_ptr = barrier_ptr; @@ -167,9 +183,11 @@ static void *_agent(void *x) int msg_sent = 0, max_forward = 0; pthread_t msg_id; pthread_attr_t attr; + DEF_TIMERS; /* only send one message to each host, * build table of the ports on each host */ + START_TIMER; slurm_attr_init(&attr); pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_DETACHED); kvs_set = xmalloc(sizeof(struct kvs_comm_set) * args->barrier_xmit_cnt); @@ -259,6 +277,9 @@ static void *_agent(void *x) } xfree(args->kvs_xmit_ptr); xfree(args); + + END_TIMER; + debug("kvs_xmit time %ld usec", DELTA_TIMER); return NULL; } @@ -356,12 +377,14 @@ static void _print_kvs(void) extern int pmi_kvs_put(struct kvs_comm_set *kvs_set_ptr) { - int i; + int i, usec_timer; struct kvs_comm *kvs_ptr; + DEF_TIMERS; /* Merge new data with old. 
* NOTE: We just move pointers rather than copy data where * possible for improved performance */ + START_TIMER; pthread_mutex_lock(&kvs_mutex); for (i=0; i<kvs_set_ptr->kvs_comm_recs; i++) { kvs_ptr = _find_kvs_by_name(kvs_set_ptr-> @@ -378,6 +401,12 @@ extern int pmi_kvs_put(struct kvs_comm_set *kvs_set_ptr) _print_kvs(); kvs_updated = 1; pthread_mutex_unlock(&kvs_mutex); + END_TIMER; + usec_timer = DELTA_TIMER; + min_time_kvs_put = MIN(min_time_kvs_put, usec_timer); + max_time_kvs_put = MAX(max_time_kvs_put, usec_timer); + tot_time_kvs_put += usec_timer; + return SLURM_SUCCESS; } diff --git a/src/api/slurm_pmi.c b/src/api/slurm_pmi.c index 2e1465aa373..d6403a47cf6 100644 --- a/src/api/slurm_pmi.c +++ b/src/api/slurm_pmi.c @@ -49,14 +49,15 @@ #include "src/common/slurm_auth.h" #define MAX_RETRIES 5 -#define PMI_TIME 500 /* spacing between RPCs, usec */ int pmi_fd = -1; +int pmi_time = 0; uint16_t srun_port = 0; slurm_addr srun_addr; static int _forward_comm_set(struct kvs_comm_set *kvs_set_ptr); static int _get_addr(void); +static void _set_pmi_time(void); static int _get_addr(void) { @@ -75,6 +76,26 @@ static int _get_addr(void) return SLURM_SUCCESS; } +static void _set_pmi_time(void) +{ + char *tmp, *endptr; + + if (pmi_time) + return; + + tmp = getenv("PMI_TIME"); + if (tmp == NULL) { + pmi_time = 500; + return; + } + + pmi_time = strtol(tmp, &endptr, 10); + if ((pmi_time < 0) || (endptr[0] != '\0')) { + error("Invalid PMI_TIME: %s", tmp); + pmi_time = 500; + } +} + /* Transmit PMI Keyval space data */ int slurm_send_kvs_comm_set(struct kvs_comm_set *kvs_set_ptr, int pmi_rank, int pmi_size) @@ -87,6 +108,7 @@ int slurm_send_kvs_comm_set(struct kvs_comm_set *kvs_set_ptr, if ((rc = _get_addr()) != SLURM_SUCCESS) return rc; + _set_pmi_time(); slurm_msg_t_init(&msg_send); msg_send.address = srun_addr; @@ -101,7 +123,7 @@ int slurm_send_kvs_comm_set(struct kvs_comm_set *kvs_set_ptr, * command is very overloaded. 
* We also increase the timeout (default timeout is * 10 secs). */ - usleep(pmi_rank * PMI_TIME); + usleep(pmi_rank * pmi_time); if (pmi_size > 1000) /* 100 secs */ timeout = slurm_get_msg_timeout() * 10000; else if (pmi_size > 100) /* 50 secs */ @@ -114,7 +136,7 @@ int slurm_send_kvs_comm_set(struct kvs_comm_set *kvs_set_ptr, error("slurm_send_kvs_comm_set: %m"); return SLURM_ERROR; } - usleep(pmi_rank * PMI_TIME); + usleep(pmi_rank * pmi_time); } return rc; @@ -139,6 +161,9 @@ int slurm_get_kvs_comm_set(struct kvs_comm_set **kvs_set_ptr, error("_get_addr: %m"); return rc; } + + _set_pmi_time(); + if (pmi_fd < 0) { if ((pmi_fd = slurm_init_msg_engine_port(0)) < 0) { error("slurm_init_msg_engine_port: %m"); @@ -177,7 +202,7 @@ int slurm_get_kvs_comm_set(struct kvs_comm_set **kvs_set_ptr, * command is very overloaded. * We also increase the timeout (default timeout is * 10 secs). */ - usleep(pmi_rank * PMI_TIME); + usleep(pmi_rank * pmi_time); if (pmi_size > 1000) /* 100 secs */ timeout = slurm_get_msg_timeout() * 10000; else if (pmi_size > 100) /* 50 secs */ @@ -190,7 +215,7 @@ int slurm_get_kvs_comm_set(struct kvs_comm_set **kvs_set_ptr, error("slurm_get_kvs_comm_set: %m"); return SLURM_ERROR; } - usleep(pmi_rank * PMI_TIME); + usleep(pmi_rank * pmi_time); } if (rc != SLURM_SUCCESS) { error("slurm_get_kvs_comm_set error_code=%d", rc); diff --git a/src/common/forward.c b/src/common/forward.c index 58eee32d3cb..f1c8ec408d0 100644 --- a/src/common/forward.c +++ b/src/common/forward.c @@ -83,7 +83,7 @@ void *_forward_thread(void *arg) name); slurm_mutex_lock(fwd_msg->forward_mutex); mark_as_failed_forward(&fwd_msg->ret_list, name, - SLURM_SOCKET_ERROR); + SLURM_COMMUNICATIONS_CONNECTION_ERROR); free(name); if (hostlist_count(hl) > 0) { slurm_mutex_unlock(fwd_msg->forward_mutex); @@ -96,7 +96,7 @@ void *_forward_thread(void *arg) slurm_mutex_lock(fwd_msg->forward_mutex); mark_as_failed_forward(&fwd_msg->ret_list, name, - SLURM_SOCKET_ERROR); + 
SLURM_COMMUNICATIONS_CONNECTION_ERROR); free(name); if (hostlist_count(hl) > 0) { slurm_mutex_unlock(fwd_msg->forward_mutex); @@ -172,9 +172,12 @@ void *_forward_thread(void *arg) } if(fwd_msg->header.forward.cnt > 0) { + static int message_timeout = -1; + if (message_timeout < 0) + message_timeout = slurm_get_msg_timeout() * 1000; steps = (fwd_msg->header.forward.cnt+1) / slurm_get_tree_width(); - fwd_msg->timeout = (FORWARD_EXTRA_STEP_WAIT_MS*steps); + fwd_msg->timeout = (message_timeout*steps); steps++; fwd_msg->timeout += (start_timeout*steps); } @@ -234,7 +237,7 @@ void *_forward_thread(void *arg) mark_as_failed_forward( &fwd_msg->ret_list, tmp, - SLURM_ERROR); + SLURM_COMMUNICATIONS_CONNECTION_ERROR); } free(tmp); } @@ -242,7 +245,7 @@ void *_forward_thread(void *arg) if(!first_node_found) { mark_as_failed_forward(&fwd_msg->ret_list, name, - SLURM_ERROR); + SLURM_COMMUNICATIONS_CONNECTION_ERROR); } } break; diff --git a/src/common/forward.h b/src/common/forward.h index f6855feb5c5..086d4cdbc8f 100644 --- a/src/common/forward.h +++ b/src/common/forward.h @@ -43,8 +43,6 @@ #include <stdint.h> #include "src/common/slurm_protocol_api.h" -#define FORWARD_EXTRA_STEP_WAIT_MS 5000 - /* * forward_init - initilize forward structure * IN: forward - forward_t * - struct to store forward info diff --git a/src/common/slurm_protocol_api.c b/src/common/slurm_protocol_api.c index e253eb56a5a..44bdc85850c 100644 --- a/src/common/slurm_protocol_api.c +++ b/src/common/slurm_protocol_api.c @@ -82,6 +82,7 @@ static slurm_protocol_config_t proto_conf_default; static slurm_protocol_config_t *proto_conf = &proto_conf_default; /* static slurm_ctl_conf_t slurmctld_conf; */ +static int message_timeout = -1; /* STATIC FUNCTIONS */ static void _remap_slurmctld_errno(void); @@ -1045,8 +1046,10 @@ List slurm_receive_msgs(slurm_fd fd, int steps, int timeout) orig_timeout = timeout; } if(steps) { + if (message_timeout < 0) + message_timeout = slurm_get_msg_timeout() * 1000; orig_timeout = 
(timeout - - (FORWARD_EXTRA_STEP_WAIT_MS*(steps-1)))/steps; + (message_timeout*(steps-1)))/steps; steps--; } @@ -1292,8 +1295,8 @@ int slurm_receive_msg_and_forward(slurm_fd fd, slurm_addr *orig_addr, msg->forward_struct->ret_list = msg->ret_list; /* take out the amount of timeout from this hop */ msg->forward_struct->timeout = header.forward.timeout; - if(msg->forward_struct->timeout < 0) - msg->forward_struct->timeout = 0; + if(msg->forward_struct->timeout <= 0) + msg->forward_struct->timeout = message_timeout; msg->forward_struct->fwd_cnt = header.forward.cnt; debug3("forwarding messages to %u nodes with timeout of %d", @@ -1877,13 +1880,14 @@ _send_and_recv_msgs(slurm_fd fd, slurm_msg_t *req, int timeout) if(slurm_send_node_msg(fd, req) >= 0) { if(req->forward.cnt>0) { /* figure out where we are in the tree and set - the timeout for to wait for our childern - correctly - (timeout+FORWARD_EXTRA_STEP_WAIT_MS sec per step) - to let the child timeout */ - + * the timeout to wait for our children + * correctly + * (timeout+message_timeout sec per step) + * to let the child timeout */ + if (message_timeout < 0) + message_timeout = slurm_get_msg_timeout() * 1000; steps = (req->forward.cnt+1)/slurm_get_tree_width(); - timeout = (FORWARD_EXTRA_STEP_WAIT_MS*steps); + timeout = (message_timeout*steps); steps++; timeout += (req->forward.timeout*steps); @@ -2127,7 +2131,7 @@ List slurm_send_recv_msgs(const char *nodelist, slurm_msg_t *msg, error("slurm_send_recv_msgs: can't get addr for " "host %s", name); mark_as_failed_forward(&tmp_ret_list, name, - SLURM_SOCKET_ERROR); + SLURM_COMMUNICATIONS_CONNECTION_ERROR); free(name); continue; } @@ -2136,7 +2140,7 @@ List slurm_send_recv_msgs(const char *nodelist, slurm_msg_t *msg, error("slurm_send_recv_msgs to %s: %m", name); mark_as_failed_forward(&tmp_ret_list, name, - SLURM_SOCKET_ERROR); + SLURM_COMMUNICATIONS_CONNECTION_ERROR); free(name); continue; } @@ -2205,7 +2209,7 @@ List slurm_send_addr_recv_msgs(slurm_msg_t *msg,
char *name, int timeout) if ((fd = slurm_open_msg_conn(&msg->address)) < 0) { mark_as_failed_forward(&ret_list, name, - SLURM_SOCKET_ERROR); + SLURM_COMMUNICATIONS_CONNECTION_ERROR); return ret_list; } diff --git a/src/common/slurm_protocol_defs.c b/src/common/slurm_protocol_defs.c index 2a26997636a..a370465746b 100644 --- a/src/common/slurm_protocol_defs.c +++ b/src/common/slurm_protocol_defs.c @@ -484,6 +484,18 @@ void inline slurm_free_srun_job_complete_msg(srun_job_complete_msg_t * msg) xfree(msg); } +void inline slurm_free_srun_exec_msg(srun_exec_msg_t *msg) +{ + int i; + + if (msg) { + for (i = 0; i < msg->argc; i++) + xfree(msg->argv[i]); + xfree(msg->argv); + xfree(msg); + } +} + void inline slurm_free_srun_ping_msg(srun_ping_msg_t * msg) { xfree(msg); @@ -1255,7 +1267,10 @@ extern uint32_t slurm_get_return_code(slurm_msg_type_t type, void *data) rc = ((return_code_msg_t *)data)->return_code; break; case RESPONSE_FORWARD_FAILED: - rc = SLURM_ERROR; + /* There may be other reasons for the failure, but + * this may be a slurm_msg_t data type lacking the + * err field found in ret_data_info_t data type */ + rc = SLURM_COMMUNICATIONS_CONNECTION_ERROR; break; default: error("don't know the rc for type %u returning %u", type, rc); diff --git a/src/common/slurm_protocol_defs.h b/src/common/slurm_protocol_defs.h index 146f97b0496..2c82c7bff86 100644 --- a/src/common/slurm_protocol_defs.h +++ b/src/common/slurm_protocol_defs.h @@ -187,6 +187,7 @@ typedef enum { SRUN_NODE_FAIL, SRUN_JOB_COMPLETE, SRUN_USER_MSG, + SRUN_EXEC, PMI_KVS_PUT_REQ = 7201, PMI_KVS_PUT_RESP, @@ -560,6 +561,13 @@ typedef struct srun_job_complete_msg { uint32_t step_id; /* step_id or NO_VAL */ } srun_job_complete_msg_t; +typedef struct srun_exec_msg { + uint32_t job_id; /* slurm job_id */ + uint32_t step_id; /* step_id or NO_VAL */ + uint16_t argc; /* argument count */ + char ** argv; /* program arguments */ +} srun_exec_msg_t; + typedef struct srun_node_fail_msg { uint32_t job_id; /* slurm 
job_id */ uint32_t step_id; /* step_id or NO_VAL */ @@ -755,6 +763,7 @@ void inline slurm_free_update_job_time_msg(job_time_msg_t * msg); void inline slurm_free_job_step_kill_msg(job_step_kill_msg_t * msg); void inline slurm_free_epilog_complete_msg(epilog_complete_msg_t * msg); void inline slurm_free_srun_job_complete_msg(srun_job_complete_msg_t * msg); +void inline slurm_free_srun_exec_msg(srun_exec_msg_t *msg); void inline slurm_free_srun_ping_msg(srun_ping_msg_t * msg); void inline slurm_free_srun_node_fail_msg(srun_node_fail_msg_t * msg); void inline slurm_free_srun_timeout_msg(srun_timeout_msg_t * msg); diff --git a/src/common/slurm_protocol_pack.c b/src/common/slurm_protocol_pack.c index 0cf4e5b5ead..0c620dc6f6c 100644 --- a/src/common/slurm_protocol_pack.c +++ b/src/common/slurm_protocol_pack.c @@ -272,6 +272,9 @@ static void _pack_job_step_kill_msg(job_step_kill_msg_t * msg, Buf buffer); static int _unpack_job_step_kill_msg(job_step_kill_msg_t ** msg_ptr, Buf buffer); +static void _pack_srun_exec_msg(srun_exec_msg_t * msg, Buf buffer); +static int _unpack_srun_exec_msg(srun_exec_msg_t ** msg_ptr, Buf buffer); + static void _pack_srun_ping_msg(srun_ping_msg_t * msg, Buf buffer); static int _unpack_srun_ping_msg(srun_ping_msg_t ** msg_ptr, Buf buffer); @@ -618,6 +621,9 @@ pack_msg(slurm_msg_t const *msg, Buf buffer) (job_id_response_msg_t *)msg->data, buffer); break; + case SRUN_EXEC: + _pack_srun_exec_msg((srun_exec_msg_t *)msg->data, buffer); + break; case SRUN_JOB_COMPLETE: case SRUN_PING: _pack_srun_ping_msg((srun_ping_msg_t *)msg->data, buffer); @@ -934,6 +940,10 @@ unpack_msg(slurm_msg_t * msg, Buf buffer) (job_id_response_msg_t **) & msg->data, buffer); break; + case SRUN_EXEC: + rc = _unpack_srun_exec_msg((srun_exec_msg_t **) & msg->data, + buffer); + break; case SRUN_JOB_COMPLETE: case SRUN_PING: rc = _unpack_srun_ping_msg((srun_ping_msg_t **) & msg->data, @@ -3704,6 +3714,36 @@ unpack_error: return SLURM_ERROR; } +static void 
+_pack_srun_exec_msg(srun_exec_msg_t * msg, Buf buffer) +{ + xassert ( msg != NULL ); + + pack32(msg ->job_id , buffer ) ; + pack32(msg ->step_id , buffer ) ; + packstr_array(msg->argv, msg->argc, buffer); +} + +static int +_unpack_srun_exec_msg(srun_exec_msg_t ** msg_ptr, Buf buffer) +{ + srun_exec_msg_t * msg; + xassert ( msg_ptr != NULL ); + + msg = xmalloc ( sizeof (srun_exec_msg_t) ) ; + *msg_ptr = msg; + + safe_unpack32(&msg->job_id , buffer ) ; + safe_unpack32(&msg->step_id , buffer ) ; + safe_unpackstr_array(&msg->argv, &msg->argc, buffer); + return SLURM_SUCCESS; + +unpack_error: + *msg_ptr = NULL; + xfree(msg); + return SLURM_ERROR; +} + static void _pack_srun_ping_msg(srun_ping_msg_t * msg, Buf buffer) { diff --git a/src/plugins/checkpoint/Makefile.am b/src/plugins/checkpoint/Makefile.am index c86f681229c..3ce36725a72 100644 --- a/src/plugins/checkpoint/Makefile.am +++ b/src/plugins/checkpoint/Makefile.am @@ -1,3 +1,3 @@ # Makefile for checkpoint plugins -SUBDIRS = aix none +SUBDIRS = aix none ompi diff --git a/src/plugins/checkpoint/Makefile.in b/src/plugins/checkpoint/Makefile.in index d02575dfcf9..6c06bf0a502 100644 --- a/src/plugins/checkpoint/Makefile.in +++ b/src/plugins/checkpoint/Makefile.in @@ -240,7 +240,7 @@ target_os = @target_os@ target_vendor = @target_vendor@ top_builddir = @top_builddir@ top_srcdir = @top_srcdir@ -SUBDIRS = aix none +SUBDIRS = aix none ompi all: all-recursive .SUFFIXES: diff --git a/src/plugins/checkpoint/aix/checkpoint_aix.c b/src/plugins/checkpoint/aix/checkpoint_aix.c index 7322a585974..46f21cb5f6b 100644 --- a/src/plugins/checkpoint/aix/checkpoint_aix.c +++ b/src/plugins/checkpoint/aix/checkpoint_aix.c @@ -192,8 +192,11 @@ extern int slurm_ckpt_op ( uint16_t op, uint16_t data, if (check_ptr->disabled) rc = ESLURM_DISABLED; else { - if (check_ptr->reply_cnt < check_ptr->node_cnt) + if ((check_ptr->reply_cnt < check_ptr->node_cnt) + && event_time) { + /* Return time of last event */ *event_time = check_ptr->time_stamp; 
+ } rc = SLURM_SUCCESS; } break; @@ -260,7 +263,9 @@ extern int slurm_ckpt_comp ( struct step_record * step_ptr, time_t event_time, return ESLURM_ALREADY_DONE; if (error_code > check_ptr->error_code) { - info("slurm_ckpt_comp error %u: %s", error_code, error_msg); + info("slurm_ckpt_comp for step %u.%u error %u: %s", + step_ptr->job_ptr->job_id, step_ptr->step_id, + error_code, error_msg); check_ptr->error_code = error_code; xfree(check_ptr->error_msg); check_ptr->error_msg = xstrdup(error_msg); @@ -272,7 +277,7 @@ extern int slurm_ckpt_comp ( struct step_record * step_ptr, time_t event_time, if (check_ptr->reply_cnt++ == check_ptr->node_cnt) { time_t now = time(NULL); long delay = (long) difftime(now, check_ptr->time_stamp); - info("Checkpoint complete for job %u.%u in %ld seconds", + info("slurm_ckpt_comp for step %u.%u in %ld secs", step_ptr->job_ptr->job_id, step_ptr->step_id, delay); check_ptr->time_stamp = now; @@ -382,8 +387,6 @@ static int _step_sig(struct step_record * step_ptr, uint16_t wait, continue; if (check_ptr->node_cnt++ > 0) continue; - check_ptr->time_stamp = time(NULL); - check_ptr->wait_time = wait; _send_sig(step_ptr->job_ptr->job_id, step_ptr->step_id, signal, node_record_table_ptr[i].name, node_record_table_ptr[i].slurm_addr); @@ -399,6 +402,9 @@ static int _step_sig(struct step_record * step_ptr, uint16_t wait, return ESLURM_INVALID_NODE_NAME; } + check_ptr->time_stamp = time(NULL); + check_ptr->wait_time = wait; + info("checkpoint requested for job %u.%u", job_ptr->job_id, step_ptr->step_id); return SLURM_SUCCESS; diff --git a/src/plugins/checkpoint/ompi/Makefile.am b/src/plugins/checkpoint/ompi/Makefile.am new file mode 100644 index 00000000000..b80b0d190d9 --- /dev/null +++ b/src/plugins/checkpoint/ompi/Makefile.am @@ -0,0 +1,13 @@ +# Makefile for checkpoint/ompi plugin + +AUTOMAKE_OPTIONS = foreign + +PLUGIN_FLAGS = -module -avoid-version --export-dynamic + +INCLUDES = -I$(top_srcdir) -I$(top_srcdir)/src/common + +pkglib_LTLIBRARIES = 
checkpoint_ompi.la + +# OpenMPI checkpoint plugin. +checkpoint_ompi_la_SOURCES = checkpoint_ompi.c +checkpoint_ompi_la_LDFLAGS = $(SO_LDFLAGS) $(PLUGIN_FLAGS) diff --git a/src/plugins/checkpoint/ompi/Makefile.in b/src/plugins/checkpoint/ompi/Makefile.in new file mode 100644 index 00000000000..999c5ea1693 --- /dev/null +++ b/src/plugins/checkpoint/ompi/Makefile.in @@ -0,0 +1,549 @@ +# Makefile.in generated by automake 1.10 from Makefile.am. +# @configure_input@ + +# Copyright (C) 1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002, +# 2003, 2004, 2005, 2006 Free Software Foundation, Inc. +# This Makefile.in is free software; the Free Software Foundation +# gives unlimited permission to copy and/or distribute it, +# with or without modifications, as long as this notice is preserved. + +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY, to the extent permitted by law; without +# even the implied warranty of MERCHANTABILITY or FITNESS FOR A +# PARTICULAR PURPOSE. 
+ +@SET_MAKE@ + +# Makefile for checkpoint/ompi plugin + +VPATH = @srcdir@ +pkgdatadir = $(datadir)/@PACKAGE@ +pkglibdir = $(libdir)/@PACKAGE@ +pkgincludedir = $(includedir)/@PACKAGE@ +am__cd = CDPATH="$${ZSH_VERSION+.}$(PATH_SEPARATOR)" && cd +install_sh_DATA = $(install_sh) -c -m 644 +install_sh_PROGRAM = $(install_sh) -c +install_sh_SCRIPT = $(install_sh) -c +INSTALL_HEADER = $(INSTALL_DATA) +transform = $(program_transform_name) +NORMAL_INSTALL = : +PRE_INSTALL = : +POST_INSTALL = : +NORMAL_UNINSTALL = : +PRE_UNINSTALL = : +POST_UNINSTALL = : +build_triplet = @build@ +host_triplet = @host@ +target_triplet = @target@ +subdir = src/plugins/checkpoint/ompi +DIST_COMMON = $(srcdir)/Makefile.am $(srcdir)/Makefile.in +ACLOCAL_M4 = $(top_srcdir)/aclocal.m4 +am__aclocal_m4_deps = $(top_srcdir)/auxdir/acx_pthread.m4 \ + $(top_srcdir)/auxdir/slurm.m4 \ + $(top_srcdir)/auxdir/x_ac__system_configuration.m4 \ + $(top_srcdir)/auxdir/x_ac_affinity.m4 \ + $(top_srcdir)/auxdir/x_ac_aix.m4 \ + $(top_srcdir)/auxdir/x_ac_bluegene.m4 \ + $(top_srcdir)/auxdir/x_ac_databases.m4 \ + $(top_srcdir)/auxdir/x_ac_debug.m4 \ + $(top_srcdir)/auxdir/x_ac_elan.m4 \ + $(top_srcdir)/auxdir/x_ac_federation.m4 \ + $(top_srcdir)/auxdir/x_ac_gpl_licensed.m4 \ + $(top_srcdir)/auxdir/x_ac_gtk.m4 \ + $(top_srcdir)/auxdir/x_ac_munge.m4 \ + $(top_srcdir)/auxdir/x_ac_ncurses.m4 \ + $(top_srcdir)/auxdir/x_ac_pam.m4 \ + $(top_srcdir)/auxdir/x_ac_ptrace.m4 \ + $(top_srcdir)/auxdir/x_ac_readline.m4 \ + $(top_srcdir)/auxdir/x_ac_setpgrp.m4 \ + $(top_srcdir)/auxdir/x_ac_setproctitle.m4 \ + $(top_srcdir)/auxdir/x_ac_sgi_job.m4 \ + $(top_srcdir)/auxdir/x_ac_slurm_ssl.m4 \ + $(top_srcdir)/auxdir/x_ac_xcpu.m4 $(top_srcdir)/configure.ac +am__configure_deps = $(am__aclocal_m4_deps) $(CONFIGURE_DEPENDENCIES) \ + $(ACLOCAL_M4) +mkinstalldirs = $(install_sh) -d +CONFIG_HEADER = $(top_builddir)/config.h $(top_builddir)/slurm/slurm.h +CONFIG_CLEAN_FILES = +am__vpath_adj_setup = srcdirstrip=`echo "$(srcdir)" | sed 
's|.|.|g'`; +am__vpath_adj = case $$p in \ + $(srcdir)/*) f=`echo "$$p" | sed "s|^$$srcdirstrip/||"`;; \ + *) f=$$p;; \ + esac; +am__strip_dir = `echo $$p | sed -e 's|^.*/||'`; +am__installdirs = "$(DESTDIR)$(pkglibdir)" +pkglibLTLIBRARIES_INSTALL = $(INSTALL) +LTLIBRARIES = $(pkglib_LTLIBRARIES) +checkpoint_ompi_la_LIBADD = +am_checkpoint_ompi_la_OBJECTS = checkpoint_ompi.lo +checkpoint_ompi_la_OBJECTS = $(am_checkpoint_ompi_la_OBJECTS) +checkpoint_ompi_la_LINK = $(LIBTOOL) --tag=CC $(AM_LIBTOOLFLAGS) \ + $(LIBTOOLFLAGS) --mode=link $(CCLD) $(AM_CFLAGS) $(CFLAGS) \ + $(checkpoint_ompi_la_LDFLAGS) $(LDFLAGS) -o $@ +DEFAULT_INCLUDES = -I. -I$(top_builddir) -I$(top_builddir)/slurm@am__isrc@ +depcomp = $(SHELL) $(top_srcdir)/auxdir/depcomp +am__depfiles_maybe = depfiles +COMPILE = $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) \ + $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) +LTCOMPILE = $(LIBTOOL) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) \ + --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) \ + $(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) +CCLD = $(CC) +LINK = $(LIBTOOL) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) \ + --mode=link $(CCLD) $(AM_CFLAGS) $(CFLAGS) $(AM_LDFLAGS) \ + $(LDFLAGS) -o $@ +SOURCES = $(checkpoint_ompi_la_SOURCES) +DIST_SOURCES = $(checkpoint_ompi_la_SOURCES) +ETAGS = etags +CTAGS = ctags +DISTFILES = $(DIST_COMMON) $(DIST_SOURCES) $(TEXINFOS) $(EXTRA_DIST) +ACLOCAL = @ACLOCAL@ +AMTAR = @AMTAR@ +AR = @AR@ +AUTHD_CFLAGS = @AUTHD_CFLAGS@ +AUTHD_LIBS = @AUTHD_LIBS@ +AUTOCONF = @AUTOCONF@ +AUTOHEADER = @AUTOHEADER@ +AUTOMAKE = @AUTOMAKE@ +AWK = @AWK@ +BG_INCLUDES = @BG_INCLUDES@ +CC = @CC@ +CCDEPMODE = @CCDEPMODE@ +CFLAGS = @CFLAGS@ +CMD_LDFLAGS = @CMD_LDFLAGS@ +CPP = @CPP@ +CPPFLAGS = @CPPFLAGS@ +CXX = @CXX@ +CXXCPP = @CXXCPP@ +CXXDEPMODE = @CXXDEPMODE@ +CXXFLAGS = @CXXFLAGS@ +CYGPATH_W = @CYGPATH_W@ +DEFS = @DEFS@ +DEPDIR = @DEPDIR@ +ECHO = @ECHO@ +ECHO_C = @ECHO_C@ +ECHO_N = @ECHO_N@ +ECHO_T = @ECHO_T@ +EGREP = @EGREP@ 
+ELAN_LIBS = @ELAN_LIBS@ +EXEEXT = @EXEEXT@ +F77 = @F77@ +FEDERATION_LDFLAGS = @FEDERATION_LDFLAGS@ +FFLAGS = @FFLAGS@ +GREP = @GREP@ +GTK2_CFLAGS = @GTK2_CFLAGS@ +GTK2_LIBS = @GTK2_LIBS@ +HAVEPGCONFIG = @HAVEPGCONFIG@ +HAVEPKGCONFIG = @HAVEPKGCONFIG@ +HAVE_AIX = @HAVE_AIX@ +HAVE_ELAN = @HAVE_ELAN@ +HAVE_FEDERATION = @HAVE_FEDERATION@ +HAVE_OPENSSL = @HAVE_OPENSSL@ +HAVE_SOME_CURSES = @HAVE_SOME_CURSES@ +INSTALL = @INSTALL@ +INSTALL_DATA = @INSTALL_DATA@ +INSTALL_PROGRAM = @INSTALL_PROGRAM@ +INSTALL_SCRIPT = @INSTALL_SCRIPT@ +INSTALL_STRIP_PROGRAM = @INSTALL_STRIP_PROGRAM@ +LDFLAGS = @LDFLAGS@ +LIBOBJS = @LIBOBJS@ +LIBS = @LIBS@ +LIBTOOL = @LIBTOOL@ +LIB_LDFLAGS = @LIB_LDFLAGS@ +LN_S = @LN_S@ +LTLIBOBJS = @LTLIBOBJS@ +MAINT = @MAINT@ +MAKEINFO = @MAKEINFO@ +MKDIR_P = @MKDIR_P@ +MUNGE_CPPFLAGS = @MUNGE_CPPFLAGS@ +MUNGE_LDFLAGS = @MUNGE_LDFLAGS@ +MUNGE_LIBS = @MUNGE_LIBS@ +MYSQL_LIBS = @MYSQL_LIBS@ +NCURSES = @NCURSES@ +NUMA_LIBS = @NUMA_LIBS@ +OBJEXT = @OBJEXT@ +PACKAGE = @PACKAGE@ +PACKAGE_BUGREPORT = @PACKAGE_BUGREPORT@ +PACKAGE_NAME = @PACKAGE_NAME@ +PACKAGE_STRING = @PACKAGE_STRING@ +PACKAGE_TARNAME = @PACKAGE_TARNAME@ +PACKAGE_VERSION = @PACKAGE_VERSION@ +PAM_LIBS = @PAM_LIBS@ +PATH_SEPARATOR = @PATH_SEPARATOR@ +PGSQL_CFLAGS = @PGSQL_CFLAGS@ +PGSQL_LIBS = @PGSQL_LIBS@ +PLPA_LIBS = @PLPA_LIBS@ +PROCTRACKDIR = @PROCTRACKDIR@ +PROJECT = @PROJECT@ +PTHREAD_CC = @PTHREAD_CC@ +PTHREAD_CFLAGS = @PTHREAD_CFLAGS@ +PTHREAD_LIBS = @PTHREAD_LIBS@ +RANLIB = @RANLIB@ +READLINE_LIBS = @READLINE_LIBS@ +RELEASE = @RELEASE@ +SEMAPHORE_LIBS = @SEMAPHORE_LIBS@ +SEMAPHORE_SOURCES = @SEMAPHORE_SOURCES@ +SET_MAKE = @SET_MAKE@ +SHELL = @SHELL@ +SLURMCTLD_PORT = @SLURMCTLD_PORT@ +SLURMD_PORT = @SLURMD_PORT@ +SLURM_API_AGE = @SLURM_API_AGE@ +SLURM_API_CURRENT = @SLURM_API_CURRENT@ +SLURM_API_MAJOR = @SLURM_API_MAJOR@ +SLURM_API_REVISION = @SLURM_API_REVISION@ +SLURM_API_VERSION = @SLURM_API_VERSION@ +SLURM_MAJOR = @SLURM_MAJOR@ +SLURM_MICRO = @SLURM_MICRO@ +SLURM_MINOR = @SLURM_MINOR@ 
+SLURM_VERSION = @SLURM_VERSION@ +SO_LDFLAGS = @SO_LDFLAGS@ +SSL_CPPFLAGS = @SSL_CPPFLAGS@ +SSL_LDFLAGS = @SSL_LDFLAGS@ +SSL_LIBS = @SSL_LIBS@ +STRIP = @STRIP@ +UTIL_LIBS = @UTIL_LIBS@ +VERSION = @VERSION@ +abs_builddir = @abs_builddir@ +abs_srcdir = @abs_srcdir@ +abs_top_builddir = @abs_top_builddir@ +abs_top_srcdir = @abs_top_srcdir@ +ac_ct_CC = @ac_ct_CC@ +ac_ct_CXX = @ac_ct_CXX@ +ac_ct_F77 = @ac_ct_F77@ +am__include = @am__include@ +am__leading_dot = @am__leading_dot@ +am__quote = @am__quote@ +am__tar = @am__tar@ +am__untar = @am__untar@ +bindir = @bindir@ +build = @build@ +build_alias = @build_alias@ +build_cpu = @build_cpu@ +build_os = @build_os@ +build_vendor = @build_vendor@ +builddir = @builddir@ +datadir = @datadir@ +datarootdir = @datarootdir@ +docdir = @docdir@ +dvidir = @dvidir@ +exec_prefix = @exec_prefix@ +host = @host@ +host_alias = @host_alias@ +host_cpu = @host_cpu@ +host_os = @host_os@ +host_vendor = @host_vendor@ +htmldir = @htmldir@ +includedir = @includedir@ +infodir = @infodir@ +install_sh = @install_sh@ +libdir = @libdir@ +libexecdir = @libexecdir@ +localedir = @localedir@ +localstatedir = @localstatedir@ +mandir = @mandir@ +mkdir_p = @mkdir_p@ +oldincludedir = @oldincludedir@ +pdfdir = @pdfdir@ +prefix = @prefix@ +program_transform_name = @program_transform_name@ +psdir = @psdir@ +sbindir = @sbindir@ +sharedstatedir = @sharedstatedir@ +srcdir = @srcdir@ +sysconfdir = @sysconfdir@ +target = @target@ +target_alias = @target_alias@ +target_cpu = @target_cpu@ +target_os = @target_os@ +target_vendor = @target_vendor@ +top_builddir = @top_builddir@ +top_srcdir = @top_srcdir@ +AUTOMAKE_OPTIONS = foreign +PLUGIN_FLAGS = -module -avoid-version --export-dynamic +INCLUDES = -I$(top_srcdir) -I$(top_srcdir)/src/common +pkglib_LTLIBRARIES = checkpoint_ompi.la + +# OpenMPI checkpoint plugin. 
+checkpoint_ompi_la_SOURCES = checkpoint_ompi.c +checkpoint_ompi_la_LDFLAGS = $(SO_LDFLAGS) $(PLUGIN_FLAGS) +all: all-am + +.SUFFIXES: +.SUFFIXES: .c .lo .o .obj +$(srcdir)/Makefile.in: @MAINTAINER_MODE_TRUE@ $(srcdir)/Makefile.am $(am__configure_deps) + @for dep in $?; do \ + case '$(am__configure_deps)' in \ + *$$dep*) \ + cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh \ + && exit 0; \ + exit 1;; \ + esac; \ + done; \ + echo ' cd $(top_srcdir) && $(AUTOMAKE) --foreign src/plugins/checkpoint/ompi/Makefile'; \ + cd $(top_srcdir) && \ + $(AUTOMAKE) --foreign src/plugins/checkpoint/ompi/Makefile +.PRECIOUS: Makefile +Makefile: $(srcdir)/Makefile.in $(top_builddir)/config.status + @case '$?' in \ + *config.status*) \ + cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh;; \ + *) \ + echo ' cd $(top_builddir) && $(SHELL) ./config.status $(subdir)/$@ $(am__depfiles_maybe)'; \ + cd $(top_builddir) && $(SHELL) ./config.status $(subdir)/$@ $(am__depfiles_maybe);; \ + esac; + +$(top_builddir)/config.status: $(top_srcdir)/configure $(CONFIG_STATUS_DEPENDENCIES) + cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh + +$(top_srcdir)/configure: @MAINTAINER_MODE_TRUE@ $(am__configure_deps) + cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh +$(ACLOCAL_M4): @MAINTAINER_MODE_TRUE@ $(am__aclocal_m4_deps) + cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh +install-pkglibLTLIBRARIES: $(pkglib_LTLIBRARIES) + @$(NORMAL_INSTALL) + test -z "$(pkglibdir)" || $(MKDIR_P) "$(DESTDIR)$(pkglibdir)" + @list='$(pkglib_LTLIBRARIES)'; for p in $$list; do \ + if test -f $$p; then \ + f=$(am__strip_dir) \ + echo " $(LIBTOOL) --mode=install $(pkglibLTLIBRARIES_INSTALL) $(INSTALL_STRIP_FLAG) '$$p' '$(DESTDIR)$(pkglibdir)/$$f'"; \ + $(LIBTOOL) --mode=install $(pkglibLTLIBRARIES_INSTALL) $(INSTALL_STRIP_FLAG) "$$p" "$(DESTDIR)$(pkglibdir)/$$f"; \ + else :; fi; \ + done + +uninstall-pkglibLTLIBRARIES: + @$(NORMAL_UNINSTALL) + @list='$(pkglib_LTLIBRARIES)'; for p in 
$$list; do \ + p=$(am__strip_dir) \ + echo " $(LIBTOOL) --mode=uninstall rm -f '$(DESTDIR)$(pkglibdir)/$$p'"; \ + $(LIBTOOL) --mode=uninstall rm -f "$(DESTDIR)$(pkglibdir)/$$p"; \ + done + +clean-pkglibLTLIBRARIES: + -test -z "$(pkglib_LTLIBRARIES)" || rm -f $(pkglib_LTLIBRARIES) + @list='$(pkglib_LTLIBRARIES)'; for p in $$list; do \ + dir="`echo $$p | sed -e 's|/[^/]*$$||'`"; \ + test "$$dir" != "$$p" || dir=.; \ + echo "rm -f \"$${dir}/so_locations\""; \ + rm -f "$${dir}/so_locations"; \ + done +checkpoint_ompi.la: $(checkpoint_ompi_la_OBJECTS) $(checkpoint_ompi_la_DEPENDENCIES) + $(checkpoint_ompi_la_LINK) -rpath $(pkglibdir) $(checkpoint_ompi_la_OBJECTS) $(checkpoint_ompi_la_LIBADD) $(LIBS) + +mostlyclean-compile: + -rm -f *.$(OBJEXT) + +distclean-compile: + -rm -f *.tab.c + +@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/checkpoint_ompi.Plo@am__quote@ + +.c.o: +@am__fastdepCC_TRUE@ $(COMPILE) -MT $@ -MD -MP -MF $(DEPDIR)/$*.Tpo -c -o $@ $< +@am__fastdepCC_TRUE@ mv -f $(DEPDIR)/$*.Tpo $(DEPDIR)/$*.Po +@AMDEP_TRUE@@am__fastdepCC_FALSE@ source='$<' object='$@' libtool=no @AMDEPBACKSLASH@ +@AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@ +@am__fastdepCC_FALSE@ $(COMPILE) -c $< + +.c.obj: +@am__fastdepCC_TRUE@ $(COMPILE) -MT $@ -MD -MP -MF $(DEPDIR)/$*.Tpo -c -o $@ `$(CYGPATH_W) '$<'` +@am__fastdepCC_TRUE@ mv -f $(DEPDIR)/$*.Tpo $(DEPDIR)/$*.Po +@AMDEP_TRUE@@am__fastdepCC_FALSE@ source='$<' object='$@' libtool=no @AMDEPBACKSLASH@ +@AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@ +@am__fastdepCC_FALSE@ $(COMPILE) -c `$(CYGPATH_W) '$<'` + +.c.lo: +@am__fastdepCC_TRUE@ $(LTCOMPILE) -MT $@ -MD -MP -MF $(DEPDIR)/$*.Tpo -c -o $@ $< +@am__fastdepCC_TRUE@ mv -f $(DEPDIR)/$*.Tpo $(DEPDIR)/$*.Plo +@AMDEP_TRUE@@am__fastdepCC_FALSE@ source='$<' object='$@' libtool=yes @AMDEPBACKSLASH@ +@AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@ 
+@am__fastdepCC_FALSE@ $(LTCOMPILE) -c -o $@ $< + +mostlyclean-libtool: + -rm -f *.lo + +clean-libtool: + -rm -rf .libs _libs + +ID: $(HEADERS) $(SOURCES) $(LISP) $(TAGS_FILES) + list='$(SOURCES) $(HEADERS) $(LISP) $(TAGS_FILES)'; \ + unique=`for i in $$list; do \ + if test -f "$$i"; then echo $$i; else echo $(srcdir)/$$i; fi; \ + done | \ + $(AWK) ' { files[$$0] = 1; } \ + END { for (i in files) print i; }'`; \ + mkid -fID $$unique +tags: TAGS + +TAGS: $(HEADERS) $(SOURCES) $(TAGS_DEPENDENCIES) \ + $(TAGS_FILES) $(LISP) + tags=; \ + here=`pwd`; \ + list='$(SOURCES) $(HEADERS) $(LISP) $(TAGS_FILES)'; \ + unique=`for i in $$list; do \ + if test -f "$$i"; then echo $$i; else echo $(srcdir)/$$i; fi; \ + done | \ + $(AWK) ' { files[$$0] = 1; } \ + END { for (i in files) print i; }'`; \ + if test -z "$(ETAGS_ARGS)$$tags$$unique"; then :; else \ + test -n "$$unique" || unique=$$empty_fix; \ + $(ETAGS) $(ETAGSFLAGS) $(AM_ETAGSFLAGS) $(ETAGS_ARGS) \ + $$tags $$unique; \ + fi +ctags: CTAGS +CTAGS: $(HEADERS) $(SOURCES) $(TAGS_DEPENDENCIES) \ + $(TAGS_FILES) $(LISP) + tags=; \ + here=`pwd`; \ + list='$(SOURCES) $(HEADERS) $(LISP) $(TAGS_FILES)'; \ + unique=`for i in $$list; do \ + if test -f "$$i"; then echo $$i; else echo $(srcdir)/$$i; fi; \ + done | \ + $(AWK) ' { files[$$0] = 1; } \ + END { for (i in files) print i; }'`; \ + test -z "$(CTAGS_ARGS)$$tags$$unique" \ + || $(CTAGS) $(CTAGSFLAGS) $(AM_CTAGSFLAGS) $(CTAGS_ARGS) \ + $$tags $$unique + +GTAGS: + here=`$(am__cd) $(top_builddir) && pwd` \ + && cd $(top_srcdir) \ + && gtags -i $(GTAGS_ARGS) $$here + +distclean-tags: + -rm -f TAGS ID GTAGS GRTAGS GSYMS GPATH tags + +distdir: $(DISTFILES) + @srcdirstrip=`echo "$(srcdir)" | sed 's/[].[^$$\\*]/\\\\&/g'`; \ + topsrcdirstrip=`echo "$(top_srcdir)" | sed 's/[].[^$$\\*]/\\\\&/g'`; \ + list='$(DISTFILES)'; \ + dist_files=`for file in $$list; do echo $$file; done | \ + sed -e "s|^$$srcdirstrip/||;t" \ + -e "s|^$$topsrcdirstrip/|$(top_builddir)/|;t"`; \ + case $$dist_files in \ 
+ */*) $(MKDIR_P) `echo "$$dist_files" | \ + sed '/\//!d;s|^|$(distdir)/|;s,/[^/]*$$,,' | \ + sort -u` ;; \ + esac; \ + for file in $$dist_files; do \ + if test -f $$file || test -d $$file; then d=.; else d=$(srcdir); fi; \ + if test -d $$d/$$file; then \ + dir=`echo "/$$file" | sed -e 's,/[^/]*$$,,'`; \ + if test -d $(srcdir)/$$file && test $$d != $(srcdir); then \ + cp -pR $(srcdir)/$$file $(distdir)$$dir || exit 1; \ + fi; \ + cp -pR $$d/$$file $(distdir)$$dir || exit 1; \ + else \ + test -f $(distdir)/$$file \ + || cp -p $$d/$$file $(distdir)/$$file \ + || exit 1; \ + fi; \ + done +check-am: all-am +check: check-am +all-am: Makefile $(LTLIBRARIES) +installdirs: + for dir in "$(DESTDIR)$(pkglibdir)"; do \ + test -z "$$dir" || $(MKDIR_P) "$$dir"; \ + done +install: install-am +install-exec: install-exec-am +install-data: install-data-am +uninstall: uninstall-am + +install-am: all-am + @$(MAKE) $(AM_MAKEFLAGS) install-exec-am install-data-am + +installcheck: installcheck-am +install-strip: + $(MAKE) $(AM_MAKEFLAGS) INSTALL_PROGRAM="$(INSTALL_STRIP_PROGRAM)" \ + install_sh_PROGRAM="$(INSTALL_STRIP_PROGRAM)" INSTALL_STRIP_FLAG=-s \ + `test -z '$(STRIP)' || \ + echo "INSTALL_PROGRAM_ENV=STRIPPROG='$(STRIP)'"` install +mostlyclean-generic: + +clean-generic: + +distclean-generic: + -test -z "$(CONFIG_CLEAN_FILES)" || rm -f $(CONFIG_CLEAN_FILES) + +maintainer-clean-generic: + @echo "This command is intended for maintainers to use" + @echo "it deletes files that may require special tools to rebuild." 
+clean: clean-am + +clean-am: clean-generic clean-libtool clean-pkglibLTLIBRARIES \ + mostlyclean-am + +distclean: distclean-am + -rm -rf ./$(DEPDIR) + -rm -f Makefile +distclean-am: clean-am distclean-compile distclean-generic \ + distclean-tags + +dvi: dvi-am + +dvi-am: + +html: html-am + +info: info-am + +info-am: + +install-data-am: + +install-dvi: install-dvi-am + +install-exec-am: install-pkglibLTLIBRARIES + +install-html: install-html-am + +install-info: install-info-am + +install-man: + +install-pdf: install-pdf-am + +install-ps: install-ps-am + +installcheck-am: + +maintainer-clean: maintainer-clean-am + -rm -rf ./$(DEPDIR) + -rm -f Makefile +maintainer-clean-am: distclean-am maintainer-clean-generic + +mostlyclean: mostlyclean-am + +mostlyclean-am: mostlyclean-compile mostlyclean-generic \ + mostlyclean-libtool + +pdf: pdf-am + +pdf-am: + +ps: ps-am + +ps-am: + +uninstall-am: uninstall-pkglibLTLIBRARIES + +.MAKE: install-am install-strip + +.PHONY: CTAGS GTAGS all all-am check check-am clean clean-generic \ + clean-libtool clean-pkglibLTLIBRARIES ctags distclean \ + distclean-compile distclean-generic distclean-libtool \ + distclean-tags distdir dvi dvi-am html html-am info info-am \ + install install-am install-data install-data-am install-dvi \ + install-dvi-am install-exec install-exec-am install-html \ + install-html-am install-info install-info-am install-man \ + install-pdf install-pdf-am install-pkglibLTLIBRARIES \ + install-ps install-ps-am install-strip installcheck \ + installcheck-am installdirs maintainer-clean \ + maintainer-clean-generic mostlyclean mostlyclean-compile \ + mostlyclean-generic mostlyclean-libtool pdf pdf-am ps ps-am \ + tags uninstall uninstall-am uninstall-pkglibLTLIBRARIES + +# Tell versions [3.59,3.63) of GNU make to not export all variables. +# Otherwise a system limit (for SysV at least) may be exceeded. 
+.NOEXPORT: diff --git a/src/plugins/checkpoint/ompi/checkpoint_ompi.c b/src/plugins/checkpoint/ompi/checkpoint_ompi.c new file mode 100644 index 00000000000..7f344cff8f1 --- /dev/null +++ b/src/plugins/checkpoint/ompi/checkpoint_ompi.c @@ -0,0 +1,309 @@ +/*****************************************************************************\ + * checkpoint_ompi.c - OpenMPI slurm checkpoint plugin. + ***************************************************************************** + * Copyright (C) 2007 The Regents of the University of California. + * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). + * Written by Morris Jette <jette1@llnl.gov> + * UCRL-CODE-226842. + * + * This file is part of SLURM, a resource management program. + * For details, see <http://www.llnl.gov/linux/slurm/>. + * + * SLURM is free software; you can redistribute it and/or modify it under + * the terms of the GNU General Public License as published by the Free + * Software Foundation; either version 2 of the License, or (at your option) + * any later version. + * + * In addition, as a special exception, the copyright holders give permission + * to link the code of portions of this program with the OpenSSL library under + * certain conditions as described in each individual source file, and + * distribute linked combinations including the two. You must obey the GNU + * General Public License in all respects for all of the code used other than + * OpenSSL. If you modify file(s) with this exception, you may extend this + * exception to your version of the file(s), but you are not obligated to do + * so. If you do not wish to do so, delete this exception statement from your + * version. If you delete this exception statement from all source files in + * the program, then also delete it here. + * + * SLURM is distributed in the hope that it will be useful, but WITHOUT ANY + * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. 
See the GNU General Public License for more + * details. + * + * You should have received a copy of the GNU General Public License along + * with SLURM; if not, write to the Free Software Foundation, Inc., + * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. +\*****************************************************************************/ + +#ifdef HAVE_CONFIG_H +# include "config.h" +#endif + +#if HAVE_STDINT_H +# include <stdint.h> +#endif +#if HAVE_INTTYPES_H +# include <inttypes.h> +#endif + +#include <stdio.h> +#include <slurm/slurm.h> +#include <slurm/slurm_errno.h> + +#include "src/common/pack.h" +#include "src/common/xassert.h" +#include "src/common/xmalloc.h" +#include "src/common/xstring.h" +#include "src/slurmctld/slurmctld.h" +#include "src/slurmctld/srun_comm.h" + +struct check_job_info { + uint16_t disabled; /* counter, checkpointable only if zero */ + uint16_t reply_cnt; + uint16_t wait_time; + time_t time_stamp; /* begin or end checkpoint time */ + uint32_t error_code; + char *error_msg; +}; + +static int _ckpt_step(struct step_record * step_ptr, uint16_t wait, int vacate); + +/* + * These variables are required by the generic plugin interface. If they + * are not found in the plugin, the plugin loader will ignore it. + * + * plugin_name - a string giving a human-readable description of the + * plugin. There is no maximum length, but the symbol must refer to + * a valid string. + * + * plugin_type - a string suggesting the type of the plugin or its + * applicability to a particular form of data or method of data handling. + * If the low-level plugin API is used, the contents of this string are + * unimportant and may be anything. 
SLURM uses the higher-level plugin + * interface which requires this string to be of the form + * + * <application>/<method> + * + * where <application> is a description of the intended application of + * the plugin (e.g., "checkpoint" for SLURM checkpoint) and <method> + * is a description of how this plugin satisfies that application. SLURM will + * only load checkpoint plugins if the plugin_type string has a + * prefix of "checkpoint/". + * + * plugin_version - an unsigned 32-bit integer giving the version number + * of the plugin. If major and minor revisions are desired, the major + * version number may be multiplied by a suitable magnitude constant such + * as 100 or 1000. Various SLURM versions will likely require a certain + * minimum versions for their plugins as the checkpoint API matures. + */ +const char plugin_name[] = "OpenMPI checkpoint plugin"; +const char plugin_type[] = "checkpoint/ompi"; +const uint32_t plugin_version = 90; + +/* + * init() is called when the plugin is loaded, before any other functions + * are called. Put global initialization here. + */ +extern int init ( void ) +{ + /* We can add a pthread here to handle timeout of pending checkpoint + * requests. If a CHECK_VACATE request, we can just abort the job. + * see checkpoint_aix.c for an example of how to do this. */ + return SLURM_SUCCESS; +} + +extern int fini ( void ) +{ + return SLURM_SUCCESS; +} + +/* + * The remainder of this file implements the standard SLURM checkpoint API. 
+ */ + +extern int slurm_ckpt_op ( uint16_t op, uint16_t data, + struct step_record * step_ptr, time_t * event_time, + uint32_t *error_code, char **error_msg ) +{ + int rc = SLURM_SUCCESS; + struct check_job_info *check_ptr; + + xassert(step_ptr); + check_ptr = (struct check_job_info *) step_ptr->check_job; + xassert(check_ptr); + + switch (op) { + case CHECK_ABLE: + if (check_ptr->disabled) + rc = ESLURM_DISABLED; + else { + if ((check_ptr->reply_cnt < 1) && event_time) { + /* Return time of last event */ + *event_time = check_ptr->time_stamp; + } + rc = SLURM_SUCCESS; + } + break; + case CHECK_DISABLE: + check_ptr->disabled++; + break; + case CHECK_ENABLE: + check_ptr->disabled--; + break; + case CHECK_CREATE: + check_ptr->time_stamp = time(NULL); + check_ptr->reply_cnt = 0; + check_ptr->error_code = 0; + xfree(check_ptr->error_msg); + rc = _ckpt_step(step_ptr, data, 0); + break; + case CHECK_VACATE: + check_ptr->time_stamp = time(NULL); + check_ptr->reply_cnt = 0; + check_ptr->error_code = 0; + xfree(check_ptr->error_msg); + rc = _ckpt_step(step_ptr, data, 1); + break; + case CHECK_RESTART: + /* Lots of work is required in Slurm to restart a + * checkpointed job. For now the user can submit a + * new job and execute "ompi_restart <snapshot>" */ + rc = ESLURM_NOT_SUPPORTED; + break; + case CHECK_ERROR: + xassert(error_code); + xassert(error_msg); + *error_code = check_ptr->error_code; + xfree(*error_msg); + *error_msg = xstrdup(check_ptr->error_msg); + break; + default: + error("Invalid checkpoint operation: %d", op); + rc = EINVAL; + } + + return rc; +} + +extern int slurm_ckpt_comp (struct step_record * step_ptr, time_t event_time, + uint32_t error_code, char *error_msg) +{ +/* FIXME: How do we tell when checkpoint completes? + * Add another RPC from srun to slurmctld? + * Where is this called from? 
*/ + struct check_job_info *check_ptr; + time_t now; + long delay; + + xassert(step_ptr); + check_ptr = (struct check_job_info *) step_ptr->check_job; + xassert(check_ptr); + + /* We ignore event_time here, just key off reply_cnt */ + if (check_ptr->reply_cnt) + return ESLURM_ALREADY_DONE; + + if (error_code > check_ptr->error_code) { + info("slurm_ckpt_comp for step %u.%u error %u: %s", + step_ptr->job_ptr->job_id, step_ptr->step_id, + error_code, error_msg); + check_ptr->error_code = error_code; + xfree(check_ptr->error_msg); + check_ptr->error_msg = xstrdup(error_msg); + return SLURM_SUCCESS; + } + + now = time(NULL); + delay = difftime(now, check_ptr->time_stamp); + info("slurm_ckpt_comp for step %u.%u in %ld secs: %s", + step_ptr->job_ptr->job_id, step_ptr->step_id, + delay, error_msg); + check_ptr->error_code = error_code; + xfree(check_ptr->error_msg); + check_ptr->error_msg = xstrdup(error_msg); + check_ptr->reply_cnt++; + check_ptr->time_stamp = now; + + return SLURM_SUCCESS; +} + +extern int slurm_ckpt_alloc_job(check_jobinfo_t *jobinfo) +{ + *jobinfo = (check_jobinfo_t) xmalloc(sizeof(struct check_job_info)); + return SLURM_SUCCESS; +} + +extern int slurm_ckpt_free_job(check_jobinfo_t jobinfo) +{ + xfree(jobinfo); + return SLURM_SUCCESS; +} + +extern int slurm_ckpt_pack_job(check_jobinfo_t jobinfo, Buf buffer) +{ + struct check_job_info *check_ptr = + (struct check_job_info *)jobinfo; + + pack16(check_ptr->disabled, buffer); + pack16(check_ptr->reply_cnt, buffer); + pack16(check_ptr->wait_time, buffer); + + pack32(check_ptr->error_code, buffer); + packstr(check_ptr->error_msg, buffer); + pack_time(check_ptr->time_stamp, buffer); + + return SLURM_SUCCESS; +} + +extern int slurm_ckpt_unpack_job(check_jobinfo_t jobinfo, Buf buffer) +{ + uint16_t uint16_tmp; + struct check_job_info *check_ptr = + (struct check_job_info *)jobinfo; + + safe_unpack16(&check_ptr->disabled, buffer); + safe_unpack16(&check_ptr->reply_cnt, buffer); + 
safe_unpack16(&check_ptr->wait_time, buffer); + + safe_unpack32(&check_ptr->error_code, buffer); + safe_unpackstr_xmalloc(&check_ptr->error_msg, &uint16_tmp, buffer); + safe_unpack_time(&check_ptr->time_stamp, buffer); + + return SLURM_SUCCESS; + + unpack_error: + xfree(check_ptr->error_msg); + return SLURM_ERROR; +} + +static int _ckpt_step(struct step_record * step_ptr, uint16_t wait, int vacate) +{ + struct check_job_info *check_ptr; + struct job_record *job_ptr; + char *argv[3]; + + xassert(step_ptr); + check_ptr = (struct check_job_info *) step_ptr->check_job; + xassert(check_ptr); + job_ptr = step_ptr->job_ptr; + xassert(job_ptr); + + if (IS_JOB_FINISHED(job_ptr)) + return ESLURM_ALREADY_DONE; + + if (check_ptr->disabled) + return ESLURM_DISABLED; + + argv[0] = "ompi-checkpoint"; + if (vacate) { + argv[1] = "--term"; + argv[2] = NULL; + } else + argv[1] = NULL; + srun_exec(step_ptr, argv); + check_ptr->time_stamp = time(NULL); + check_ptr->wait_time = wait; + info("checkpoint requested for job %u.%u", + job_ptr->job_id, step_ptr->step_id); + return SLURM_SUCCESS; +} diff --git a/src/plugins/mpi/mpichmx/Makefile.in b/src/plugins/mpi/mpichmx/Makefile.in index 4b0feba9c10..4c5196d8e9c 100644 --- a/src/plugins/mpi/mpichmx/Makefile.in +++ b/src/plugins/mpi/mpichmx/Makefile.in @@ -44,6 +44,7 @@ am__aclocal_m4_deps = $(top_srcdir)/auxdir/acx_pthread.m4 \ $(top_srcdir)/auxdir/x_ac_affinity.m4 \ $(top_srcdir)/auxdir/x_ac_aix.m4 \ $(top_srcdir)/auxdir/x_ac_bluegene.m4 \ + $(top_srcdir)/auxdir/x_ac_databases.m4 \ $(top_srcdir)/auxdir/x_ac_debug.m4 \ $(top_srcdir)/auxdir/x_ac_elan.m4 \ $(top_srcdir)/auxdir/x_ac_federation.m4 \ @@ -132,10 +133,12 @@ FFLAGS = @FFLAGS@ GREP = @GREP@ GTK2_CFLAGS = @GTK2_CFLAGS@ GTK2_LIBS = @GTK2_LIBS@ +HAVEPGCONFIG = @HAVEPGCONFIG@ HAVEPKGCONFIG = @HAVEPKGCONFIG@ HAVE_AIX = @HAVE_AIX@ HAVE_ELAN = @HAVE_ELAN@ HAVE_FEDERATION = @HAVE_FEDERATION@ +HAVE_OPENSSL = @HAVE_OPENSSL@ HAVE_SOME_CURSES = @HAVE_SOME_CURSES@ INSTALL = @INSTALL@ 
INSTALL_DATA = @INSTALL_DATA@ @@ -155,6 +158,7 @@ MKDIR_P = @MKDIR_P@ MUNGE_CPPFLAGS = @MUNGE_CPPFLAGS@ MUNGE_LDFLAGS = @MUNGE_LDFLAGS@ MUNGE_LIBS = @MUNGE_LIBS@ +MYSQL_LIBS = @MYSQL_LIBS@ NCURSES = @NCURSES@ NUMA_LIBS = @NUMA_LIBS@ OBJEXT = @OBJEXT@ @@ -166,6 +170,8 @@ PACKAGE_TARNAME = @PACKAGE_TARNAME@ PACKAGE_VERSION = @PACKAGE_VERSION@ PAM_LIBS = @PAM_LIBS@ PATH_SEPARATOR = @PATH_SEPARATOR@ +PGSQL_CFLAGS = @PGSQL_CFLAGS@ +PGSQL_LIBS = @PGSQL_LIBS@ PLPA_LIBS = @PLPA_LIBS@ PROCTRACKDIR = @PROCTRACKDIR@ PROJECT = @PROJECT@ @@ -195,6 +201,7 @@ SSL_CPPFLAGS = @SSL_CPPFLAGS@ SSL_LDFLAGS = @SSL_LDFLAGS@ SSL_LIBS = @SSL_LIBS@ STRIP = @STRIP@ +UTIL_LIBS = @UTIL_LIBS@ VERSION = @VERSION@ abs_builddir = @abs_builddir@ abs_srcdir = @abs_srcdir@ diff --git a/src/plugins/sched/wiki2/hostlist.c b/src/plugins/sched/wiki2/hostlist.c index 8c30b5bcefa..bd8d0bfd7c1 100644 --- a/src/plugins/sched/wiki2/hostlist.c +++ b/src/plugins/sched/wiki2/hostlist.c @@ -51,12 +51,15 @@ #include <stdlib.h> #include <string.h> +#include "./msg.h" #include "src/common/hostlist.h" #include "src/common/node_select.h" #include "src/common/xmalloc.h" #include "src/common/xstring.h" -#define MOAB_FORMAT1 1 +static void _append_hl_buf(char **buf, hostlist_t *hl_tmp, int *reps); +static char * _task_list(struct job_record *job_ptr); +static char * _task_list_exp(struct job_record *job_ptr); /* * Convert Moab supplied TASKLIST expression into a SLURM hostlist expression @@ -133,32 +136,6 @@ extern char * moab2slurm_task_list(char *moab_tasklist, int *task_cnt) return slurm_tasklist; } -#ifndef MOAB_FORMAT1 -/* Append to buf a compact tasklist expression (e.g. 
"tux[0-1]*2") - * Prepend ":" to expression as needed */ -static void _append_hl_buf(char **buf, hostlist_t *hl_tmp, int *reps) -{ - int host_str_len = 4096; - char *host_str, rep_str[8]; - - host_str = xmalloc(host_str_len); - hostlist_uniq(*hl_tmp); - while (hostlist_ranged_string(*hl_tmp, host_str_len, host_str) < 0) { - host_str_len *= 2; - xrealloc(*host_str, host_str_len); - } - if (*buf) - xstrcat(*buf, ":"); - xstrcat(*buf, host_str); - snprintf(rep_str, 8, "*%d", *reps); - xstrcat(*buf, rep_str); - xfree(host_str); - hostlist_destroy(*hl_tmp); - *hl_tmp = (hostlist_t) NULL; - *reps = 0; -} -#endif - /* * Report a job's tasks a a MOAB TASKLIST expression * @@ -169,10 +146,15 @@ static void _append_hl_buf(char **buf, hostlist_t *hl_tmp, int *reps) */ extern char * slurm_job2moab_task_list(struct job_record *job_ptr) { -#ifdef MOAB_FORMAT1 - /* - * Moab format 1: tux0:tux0:tux1:tux1:tux2 - */ + if (use_host_exp) + return _task_list_exp(job_ptr); + else + return _task_list(job_ptr); +} + +/* Return task list in Moab format 1: tux0:tux0:tux1:tux1:tux2 */ +static char * _task_list(struct job_record *job_ptr) +{ int i, j; char *buf = NULL, *host; hostlist_t hl = hostlist_create(job_ptr->nodes); @@ -200,10 +182,35 @@ extern char * slurm_job2moab_task_list(struct job_record *job_ptr) } hostlist_destroy(hl); return buf; -#else - /* - * Moab format 2: tux[0-1]*2:tux2 - */ +} + +/* Append to buf a compact tasklist expression (e.g. 
"tux[0-1]*2") + * Prepend ":" to expression as needed */ +static void _append_hl_buf(char **buf, hostlist_t *hl_tmp, int *reps) +{ + int host_str_len = 4096; + char *host_str, rep_str[8]; + + host_str = xmalloc(host_str_len); + hostlist_uniq(*hl_tmp); + while (hostlist_ranged_string(*hl_tmp, host_str_len, host_str) < 0) { + host_str_len *= 2; + xrealloc(*host_str, host_str_len); + } + if (*buf) + xstrcat(*buf, ":"); + xstrcat(*buf, host_str); + snprintf(rep_str, 8, "*%d", *reps); + xstrcat(*buf, rep_str); + xfree(host_str); + hostlist_destroy(*hl_tmp); + *hl_tmp = (hostlist_t) NULL; + *reps = 0; +} + +/* Return task list in Moab format 2: tux[0-1]*2:tux2 */ +static char * _task_list_exp(struct job_record *job_ptr) +{ int i, reps = -1; char *buf = NULL, *host; hostlist_t hl = hostlist_create(job_ptr->nodes); @@ -245,5 +252,4 @@ extern char * slurm_job2moab_task_list(struct job_record *job_ptr) if (hl_tmp) _append_hl_buf(&buf, &hl_tmp, &reps); return buf; -#endif } diff --git a/src/scontrol/update_job.c b/src/scontrol/update_job.c index 48923da5a8e..a9f491800d7 100644 --- a/src/scontrol/update_job.c +++ b/src/scontrol/update_job.c @@ -80,7 +80,7 @@ scontrol_checkpoint(char *op, char *job_step_id_str) sizeof(time_str)); snprintf(buf, sizeof(buf), "Began at %s\n", time_str); - printf(time_str); + printf(buf); } else printf("Yes\n"); } else if (slurm_get_errno() == ESLURM_DISABLED) { diff --git a/src/slurmctld/agent.c b/src/slurmctld/agent.c index 534e14f404f..e6e44d17dc2 100644 --- a/src/slurmctld/agent.c +++ b/src/slurmctld/agent.c @@ -4,7 +4,7 @@ * * $Id$ ***************************************************************************** - * Copyright (C) 2002-2006 The Regents of the University of California. + * Copyright (C) 2002-2007 The Regents of the University of California. * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). * Written by Morris Jette <jette1@llnl.gov>, et. al. 
* Derived from pdsh written by Jim Garlick <garlick1@llnl.gov> @@ -379,6 +379,7 @@ static agent_info_t *_make_agent_info(agent_arg_t *agent_arg_ptr) if ((agent_arg_ptr->msg_type != REQUEST_SHUTDOWN) && (agent_arg_ptr->msg_type != REQUEST_RECONFIGURE) + && (agent_arg_ptr->msg_type != SRUN_EXEC) && (agent_arg_ptr->msg_type != SRUN_TIMEOUT) && (agent_arg_ptr->msg_type != SRUN_NODE_FAIL) && (agent_arg_ptr->msg_type != SRUN_USER_MSG) @@ -495,6 +496,7 @@ static void *_wdog(void *args) ret_data_info_t *ret_data_info = NULL; if ( (agent_ptr->msg_type == SRUN_JOB_COMPLETE) + || (agent_ptr->msg_type == SRUN_EXEC) || (agent_ptr->msg_type == SRUN_PING) || (agent_ptr->msg_type == SRUN_TIMEOUT) || (agent_ptr->msg_type == SRUN_USER_MSG) @@ -582,6 +584,7 @@ static void _notify_slurmctld_jobs(agent_info_t *agent_ptr) job_id = msg->job_id; step_id = NO_VAL; } else if ((agent_ptr->msg_type == SRUN_JOB_COMPLETE) + || (agent_ptr->msg_type == SRUN_EXEC) || (agent_ptr->msg_type == SRUN_USER_MSG)) { return; /* no need to note srun response */ } else if (agent_ptr->msg_type == SRUN_NODE_FAIL) { @@ -785,6 +788,7 @@ static void *_thread_per_group_rpc(void *args) is_kill_msg = ( (msg_type == REQUEST_KILL_TIMELIMIT) || (msg_type == REQUEST_TERMINATE_JOB) ); srun_agent = ( (msg_type == SRUN_PING) || + (msg_type == SRUN_EXEC) || (msg_type == SRUN_JOB_COMPLETE) || (msg_type == SRUN_TIMEOUT) || (msg_type == SRUN_USER_MSG) || @@ -1298,6 +1302,8 @@ static void _purge_agent_args(agent_arg_t *agent_arg_ptr) slurm_free_kill_job_msg(agent_arg_ptr->msg_args); else if (agent_arg_ptr->msg_type == SRUN_USER_MSG) slurm_free_srun_user_msg(agent_arg_ptr->msg_args); + else if (agent_arg_ptr->msg_type == SRUN_EXEC) + slurm_free_srun_exec_msg(agent_arg_ptr->msg_args); else xfree(agent_arg_ptr->msg_args); } diff --git a/src/slurmctld/srun_comm.c b/src/slurmctld/srun_comm.c index 3f48278e5f4..c86175fcee9 100644 --- a/src/slurmctld/srun_comm.c +++ b/src/slurmctld/srun_comm.c @@ -1,7 +1,7 @@ 
/*****************************************************************************\ * srun_comm.c - srun communications ***************************************************************************** - * Copyright (C) 2002-2006 The Regents of the University of California. + * Copyright (C) 2002-2007 The Regents of the University of California. * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). * Written by Morris Jette <jette1@llnl.gov> * UCRL-CODE-226842. @@ -315,7 +315,7 @@ extern void srun_step_complete (struct step_record *step_ptr) if (step_ptr->port && step_ptr->host && step_ptr->host[0]) { addr = xmalloc(sizeof(struct sockaddr_in)); slurm_set_addr(addr, step_ptr->port, step_ptr->host); - msg_arg = xmalloc(sizeof(srun_timeout_msg_t)); + msg_arg = xmalloc(sizeof(srun_job_complete_msg_t)); msg_arg->job_id = step_ptr->job_ptr->job_id; msg_arg->step_id = step_ptr->step_id; _srun_agent_launch(addr, step_ptr->host, SRUN_JOB_COMPLETE, @@ -323,6 +323,40 @@ extern void srun_step_complete (struct step_record *step_ptr) } } +/* + * srun_exec - request that srun execute a specific command + * and route it's output to stdout + * IN step_ptr - pointer to the slurmctld job step record + * IN argv - command and arguments to execute + */ +extern void srun_exec(struct step_record *step_ptr, char **argv) +{ + slurm_addr * addr; + srun_exec_msg_t *msg_arg; + int cnt = 1, i; + + xassert(step_ptr); + + if (step_ptr->port && step_ptr->host && step_ptr->host[0]) { + for (i=0; argv[i]; i++) + cnt++; /* start at 1 to include trailing NULL */ + addr = xmalloc(sizeof(struct sockaddr_in)); + slurm_set_addr(addr, step_ptr->port, step_ptr->host); + msg_arg = xmalloc(sizeof(srun_exec_msg_t)); + msg_arg->job_id = step_ptr->job_ptr->job_id; + msg_arg->step_id = step_ptr->step_id; + msg_arg->argc = cnt; + msg_arg->argv = xmalloc(sizeof(char *) * cnt); + for (i=0; i<cnt ; i++) + msg_arg->argv[i] = xstrdup(argv[i]); + _srun_agent_launch(addr, step_ptr->host, SRUN_EXEC, + msg_arg); 
+ } else { + error("srun_exec %u.%u lacks communication channel", + step_ptr->job_ptr->job_id, step_ptr->step_id); + } +} + /* * srun_response - note that srun has responded * IN job_id - id of job responding diff --git a/src/slurmctld/srun_comm.h b/src/slurmctld/srun_comm.h index 62c83a5370d..f74cdec7ae2 100644 --- a/src/slurmctld/srun_comm.h +++ b/src/slurmctld/srun_comm.h @@ -1,7 +1,7 @@ /*****************************************************************************\ * srun_comm.h - definitions srun communications ***************************************************************************** - * Copyright (C) 2002-2006 The Regents of the University of California. + * Copyright (C) 2002-2007 The Regents of the University of California. * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). * Written by Morris Jette <jette@llnl.gov> et. al. * UCRL-CODE-226842. @@ -49,6 +49,15 @@ */ extern void srun_allocate (uint32_t job_id); +/* + * srun_exec - request that srun execute a specific command + * and route it's output to stdout + * IN step_ptr - pointer to the slurmctld job step record + * IN argv - command and arguments to execute + */ +extern void srun_exec(struct step_record *step_ptr, char **argv); + + /* * srun_job_complete - notify srun of a job's termination * IN job_ptr - pointer to the slurmctld job record diff --git a/src/slurmd/slurmstepd/step_terminate_monitor.c b/src/slurmd/slurmstepd/step_terminate_monitor.c index a8a02770d49..9b01edd913f 100644 --- a/src/slurmd/slurmstepd/step_terminate_monitor.c +++ b/src/slurmd/slurmstepd/step_terminate_monitor.c @@ -145,6 +145,7 @@ static int call_external_program(void) int status, rc, opt; pid_t cpid; int max_wait = 300; /* seconds */ + int time_remaining; debug("step_terminate_monitor: unkillable after %d sec, calling: %s", timeout, program_name); @@ -189,6 +190,7 @@ static int call_external_program(void) } opt = WNOHANG; + time_remaining = max_wait; while (1) { rc = waitpid(cpid, &status, opt); if 
(rc < 0) { @@ -200,7 +202,7 @@ static int call_external_program(void) return 0; } else if (rc == 0) { sleep(1); - if ((--max_wait) == 0) { + if ((--time_remaining) == 0) { error("step_terminate_monitor: %s still running" " after %d seconds. Killing.", program_name, max_wait); diff --git a/src/srun/msg.c b/src/srun/msg.c index 42fd89aa09e..ed1f4c1c8a4 100644 --- a/src/srun/msg.c +++ b/src/srun/msg.c @@ -49,9 +49,12 @@ #include <signal.h> #include <string.h> #include <sys/poll.h> +#include <sys/stat.h> +#include <sys/types.h> #include <sys/wait.h> #include <time.h> #include <stdlib.h> +#include <unistd.h> #include <slurm/slurm_errno.h> @@ -77,6 +80,7 @@ #include "src/srun/allocate.h" #include "src/srun/multi_prog.h" #include "src/srun/signals.h" +#include "src/srun/srun.h" #include "src/common/xstring.h" @@ -93,6 +97,7 @@ static slurm_fd slurmctld_fd = (slurm_fd) NULL; static void _accept_msg_connection(srun_job_t *job, int fdnum); static void _confirm_launch_complete(srun_job_t *job); static void _dump_proctable(srun_job_t *job); +static void _exec_prog(slurm_msg_t *msg); static void _exit_handler(srun_job_t *job, slurm_msg_t *exit_msg); static void _handle_msg(srun_job_t *job, slurm_msg_t *msg); static inline bool _job_msg_done(srun_job_t *job); @@ -306,6 +311,91 @@ _process_launch_resp(srun_job_t *job, launch_tasks_response_msg_t *msg) return; } +/* This is used to initiate an OpenMPI checkpoint program, + * but is written to be general purpose */ +static void +_exec_prog(slurm_msg_t *msg) +{ + pid_t child; + int pfd[2], status, exit_code = 0, i; + ssize_t len; + char *argv[4], buf[256] = ""; + time_t now = time(NULL); + bool checkpoint = false; + srun_exec_msg_t *exec_msg = msg->data; + + if (exec_msg->argc > 2) { + verbose("Exec '%s %s' for %u.%u", + exec_msg->argv[0], exec_msg->argv[1], + exec_msg->job_id, exec_msg->step_id); + } else { + verbose("Exec '%s' for %u.%u", + exec_msg->argv[0], + exec_msg->job_id, exec_msg->step_id); + } + + if 
(strcmp(exec_msg->argv[0], "ompi-checkpoint") == 0) + checkpoint = true; + if (checkpoint) { + /* OpenMPI specific checkpoint support */ + info("Checkpoint started at %s", ctime(&now)); + for (i=0; (exec_msg->argv[i] && (i<2)); i++) { + argv[i] = exec_msg->argv[i]; + } + snprintf(buf, sizeof(buf), "%ld", (long) srun_ppid); + argv[i] = buf; + argv[i+1] = NULL; + } + + if (pipe(pfd) == -1) { + snprintf(buf, sizeof(buf), "pipe: %s", strerror(errno)); + error("%s", buf); + exit_code = errno; + goto fini; + } + + child = fork(); + if (child == 0) { + int fd = open("/dev/null", O_RDONLY); + dup2(fd, 0); /* stdin from /dev/null */ + dup2(pfd[1], 1); /* stdout to pipe */ + dup2(pfd[1], 2); /* stderr to pipe */ + close(pfd[0]); + close(pfd[1]); + if (checkpoint) + execvp(exec_msg->argv[0], argv); + else + execvp(exec_msg->argv[0], exec_msg->argv); + error("execvp(%s): %m", exec_msg->argv[0]); + } else if (child < 0) { + snprintf(buf, sizeof(buf), "fork: %s", strerror(errno)); + error("%s", buf); + exit_code = errno; + goto fini; + } else { + close(pfd[1]); + len = read(pfd[0], buf, sizeof(buf)); + close(pfd[0]); + waitpid(child, &status, 0); + exit_code = WEXITSTATUS(status); + } + +fini: if (checkpoint) { + now = time(NULL); + if (exit_code) { + info("Checkpoint completion code %d at %s", + exit_code, ctime(&now)); + } else { + info("Checkpoint completed successfully at %s", + ctime(&now)); + } + if (buf[0]) + info("Checkpoint location: %s", buf); + slurm_checkpoint_complete(exec_msg->job_id, exec_msg->step_id, + time(NULL), (uint32_t) exit_code, buf); + } +} + static void update_running_tasks(srun_job_t *job, uint32_t nodeid) { @@ -575,6 +665,10 @@ _handle_msg(srun_job_t *job, slurm_msg_t *msg) slurm_send_rc_msg(msg, SLURM_SUCCESS); slurm_free_srun_ping_msg(msg->data); break; + case SRUN_EXEC: + _exec_prog(msg); + slurm_free_srun_exec_msg(msg->data); + break; case SRUN_JOB_COMPLETE: _job_step_complete(job, msg); slurm_free_srun_job_complete_msg(msg->data); diff --git 
a/src/srun/srun.c b/src/srun/srun.c index 44e01c4a139..c67019675a3 100644 --- a/src/srun/srun.c +++ b/src/srun/srun.c @@ -100,6 +100,7 @@ #define TYPE_SCRIPT 2 mpi_plugin_client_info_t mpi_job_info[1]; +pid_t srun_ppid = 0; static struct termios termdefaults; /* @@ -162,7 +163,7 @@ int srun(int ac, char **av) error ("srun initialization failed"); exit (1); } - + srun_ppid = getppid(); /* reinit log with new verbosity (if changed by command line) */ diff --git a/src/srun/srun.h b/src/srun/srun.h index 530b7bebe16..90f9aaf6230 100644 --- a/src/srun/srun.h +++ b/src/srun/srun.h @@ -31,9 +31,14 @@ # include "config.h" #endif +#include <sys/types.h> +#include <unistd.h> + #include "src/api/step_io.h" #include "src/srun/srun_job.h" +extern pid_t srun_ppid; /* required for OpenMPI checkpoint */ + void srun_set_stdio_fds(srun_job_t *job, slurm_step_io_fds_t *cio_fds); #endif /* !_HAVE_SRUN_H */ diff --git a/src/srun/srun_job.c b/src/srun/srun_job.c index baea0824292..7c1a17b722d 100644 --- a/src/srun/srun_job.c +++ b/src/srun/srun_job.c @@ -683,7 +683,6 @@ fwd_signal(srun_job_t *job, int signo, int max_threads) error("%s: signal: %s", ret_data_info->node_name, slurm_strerror(rc)); - destroy_data_info(ret_data_info); } } list_iterator_destroy(itr); diff --git a/testsuite/expect/test7.2.prog.c b/testsuite/expect/test7.2.prog.c index 6ed5dd01b7b..9c18cca82a2 100644 --- a/testsuite/expect/test7.2.prog.c +++ b/testsuite/expect/test7.2.prog.c @@ -417,7 +417,7 @@ main (int argc, char **argv) pmi_rank, tv_str); } if (pmi_rank == 0) { - printf("NOTE: All failures, "); + printf("NOTE: All failures reported, "); printf("but only first four successes reported\n"); } exit(0); -- GitLab