diff --git a/doc/bgl.report/Makefile b/doc/bgl.report/Makefile new file mode 100644 index 0000000000000000000000000000000000000000..9c58659a7a206f6ee177e614060bec316a3b6685 --- /dev/null +++ b/doc/bgl.report/Makefile @@ -0,0 +1,56 @@ +# The following comments are to remind me how the automatic variables work: +# $@ - target +# $% - target member +# $< - First prerequisite +# $? - All (newer) prerequisites +# $^ - All prerequisites +# $+ - $^ but with repetitions +# $* - $* stem of pattern (for "foo.c" in %.c:%.o this would be "foo") +# 'info "GNU make"': "Using variables": "Automatic" also lists a few more. + +REPORT = report + +TEX = ../common/llnlCoverPage.tex $(REPORT).tex + +FIGDIR = ../figures +FIGS = $(FIGDIR)/arch.eps \ + $(FIGDIR)/connections.eps \ + $(FIGDIR)/entities.eps \ + $(FIGDIR)/interactive-job-init.eps \ + $(FIGDIR)/slurm-arch.eps + +PLOTS = $(FIGDIR)/times.eps + +BIB = ../common/project.bib + +%.eps: %.dia + dia --nosplash -e $@ $< +%.eps: %.gpl + gnuplot $< +%.eps: %.fig + fig2dev -Lps $< $@ +%.eps: %.obj + tgif -print -eps $< +%.ps: %.dvi + dvips -K -t letter -o $(@F) $(<F) +%.pdf: %.dvi + dvipdf $< $@ + +all: $(REPORT).ps + + +$(REPORT).dvi: $(TEX) $(FIGS) $(PLOTS) $(BIB) + rm -f *.log *.aux *.blg *.bbl + (TEXINPUTS=.:../common::; export TEXINPUTS; \ + BIBINPUTS=.:../common::; export BIBINPUTS; \ + latex $(REPORT); \ + bibtex $(REPORT); \ + latex $(REPORT); \ + latex $(REPORT) ) + +view: $(REPORT).ps + ghostview $(REPORT) & + +clean: + rm -f *~ *.dvi *.log *.aux $(REPORT).ps *.blg *.bbl #*.eps #*.gif *.ps + diff --git a/doc/bgl.report/report.tex b/doc/bgl.report/report.tex new file mode 100644 index 0000000000000000000000000000000000000000..2deb4deacfb3680009313bcaa16309326ed389b4 --- /dev/null +++ b/doc/bgl.report/report.tex @@ -0,0 +1,1105 @@ +% Presenter info: +% http://www.linuxclustersinstitute.org/Linux-HPC-Revolution/presenterinfo.html +% +% Main Text Layout +% Set the main text in 10 point Times Roman or Times New Roman (normal), +% (no 
boldface), using single line spacing. All text should be in a single +% column and justified. +% +% Opening Style (First Page) +% This includes the title of the paper, the author names, organization and +% country, the abstract, and the first part of the paper. +% * Start the title 35mm down from the top margin in Times Roman font, 16 +% point bold, range left. Capitalize only the first letter of the first +% word and proper nouns. +% * On a new line, type the authors' names, organizations, and country only +% (not the full postal address, although you may add the name of your +% department), in Times Roman, 11 point italic, range left. +% * Start the abstract with the heading two lines below the last line of the +% address. Set the abstract in Times Roman, 12 point bold. +% * Leave one line, then type the abstract in Times Roman 10 point, justified +% with single line spacing. +% +% Other Pages +% For the second and subsequent pages, use the full 190 x 115mm area and type +% in one column beginning at the upper right of each page, inserting tables +% and figures as required. +% +% We're recommending the Lecture Notes in Computer Science styles from +% Springer Verlag --- google on Springer Verlag LaTeX. These work nicely, +% *except* that it does not work with the hyperref package. Sigh. 
+% +% http://www.springer.de/comp/lncs/authors.html +% +% NOTE: This is an excerpt from the document in slurm/doc/pubdesign + +\documentclass[10pt,onecolumn,times]{../common/llncs} + +\usepackage{verbatim,moreverb} +\usepackage{float} + +% Needed for LLNL cover page / TID bibliography style +\usepackage{calc} +\usepackage{epsfig} +\usepackage{graphics} +\usepackage{hhline} +\input{pstricks} +\input{pst-node} +\usepackage{chngpage} +\input{llnlCoverPage} + +% Uncomment for DRAFT "watermark" +\usepackage{draftcopy} + +% Times font as default roman +\renewcommand{\sfdefault}{phv} +\renewcommand{\rmdefault}{ptm} +%\renewcommand{\ttdefault}{pcr} +%\usepackage{times} +\renewcommand{\labelitemi}{$\bullet$} + +\setlength{\textwidth}{115mm} +\setlength{\textheight}{190mm} +\setlength{\oddsidemargin}{(\paperwidth-\textwidth)/2 - 1in} +\setlength{\topmargin}{(\paperheight-\textheight -\headheight-\headsep-\footskip +)/2 - 1in + .5in } + +% couple of macros for the title page and document +\def\ctit{SLURM Resource Management for Blue Gene/L} +\def\ucrl{UCRL-JC-TBD} +\def\auth{Morris Jette \\ Dan Phung \\ Danny Auble} +\def\pubdate{October 18, 2004} +\def\journal{Conference TBD} + +\begin{document} + +% make the cover page +%\makeLLNLCover{\ucrl}{\ctit}{\auth}{\journal}{\pubdate}{0in}{0in} + +% Title - 16pt bold +\vspace*{35mm} +\noindent\Large +\textbf{\ctit} +\vskip1\baselineskip +% Authors - 11pt +\noindent\large +{Morris Jette, Dan Phung and Danny Auble \\ +{\em Lawrence Livermore National Laboratory, USA} +\vskip2\baselineskip +% Abstract heading - 12pt bold +\noindent\large +\textbf{Abstract} +\vskip1\baselineskip +% Abstract itself - 10pt +\noindent\normalsize +The Blue Gene/L (BGL) system is a highly scalable computer developed +by IBM for Lawrence Livermore National Laboratory (LLNL). +The current system has over 130,000 processors interconnected by a +three-dimensional toroidal network with complex rules for managing +the network and allocating resources to jobs. 
+We selected SLURM (Simple Linux Utility for Resource Management) to +fulfill this role. +SLURM is an open source, fault-tolerant, and highly scalable cluster +management and job scheduling system in widespread use on Linux clusters. +This paper presents overviews of BGL resource management issues and +SLURM architecture. +It also presents a description of how SLURM provides resource +management for BGL and preliminary performance results. + +% define some additional macros for the body +\newcommand{\munged}{{\tt munged}} +\newcommand{\srun}{{\tt srun}} +\newcommand{\scancel}{{\tt scancel}} +\newcommand{\squeue}{{\tt squeue}} +\newcommand{\scontrol}{{\tt scontrol}} +\newcommand{\sinfo}{{\tt sinfo}} +\newcommand{\slurmctld}{{\tt slurmctld}} +\newcommand{\slurmd}{{\tt slurmd}} +\newcommand{\smap}{{\tt smap}} + +\section{Overview} + +The Blue Gene/L system offers a unique cell-based design in which +the capacity can be expanded without introducing bottlenecks. +The Blue Gene/L system delivered to LLNL consists of +131,072 processors and 16TB of memory. +The peak computational rate will be 360 TeraFLOPs. + +Simple Linux Utility for Resource Management (SLURM)\footnote{A tip of +the hat to Matt Groening and creators of {\em Futurama}, +where Slurm is the most popular carbonated beverage in the universe.} +is a resource management system suitable for use on both small and +very large clusters. +SLURM was jointly developed by Lawrence Livermore National Laboratory +(LLNL) and Linux NetworX. +It has been deployed on hundreds of Linux clusters world-wide and has +proven both highly reliable and highly scalable. + +\section{Architecture of Blue Gene/L} + +The basic building-blocks of Blue Gene/L are c-nodes consisting +of two processors based upon the PowerPC 550GX, 256MB of memory +and support for five separate networks on a single chip. 
+These are subsequently grouped into base partitions, each consisting +of 512 c-nodes in an eight by eight by eight grid with the same +network support. +The Blue Gene/L system delivered to LLNL consists of 128 base +partitions organized in an eight by four by four grid. +The minimal resource allocation unit for applications is one +base partition so that at most 128 simultaneous jobs may execute. +However, we anticipate a substantially smaller number of simultaneous +jobs, each using at least several base partitions. + +The c-nodes execute a custom micro-kernel. +System calls that can not directly be processed by the c-node +micro-kernel are routed to one of the system's I/O nodes. +There are 1024 I/O nodes running the Linux operating system, +each of which services the requests from 64 c-nodes. + +Three distinct communications networks are supported: +a three-dimensional torus with direct nearest-neighbor connections; +a global tree network for broadcast and reduction operations; and +a barrier network for synchronization. + + +Mesh vs. Torus +Overhead of changes (e.g. reboot nodes) +Etc. + +\section{Architecture of SLURM} + +Several SLURM features make it well suited to serve as a resource manager +for Blue Gene/L. + +\begin{itemize} + +\item {\tt Scalability}: +The SLURM daemons are multi-threaded with independent read and write +locks on the various data structures. +SLURM presently manages several Linux clusters with over 1000 nodes +and executes full-system parallel jobs on these systems in a few seconds. + +\item {\tt Portability}: +SLURM is written in the C language, with a GNU {\em autoconf} configuration engine. +While initially written for Linux, other Unix-like operating systems including +AIX have proven easy porting targets. +SLURM also supports a general purpose ``plugin'' mechanism, which +permits a variety of different infrastructures to be easily supported. +The SLURM configuration file specifies which set of plugin modules +should be used. 
+For example, plugins are used for interfacing with different authentication +mechanisms and node interconnects. + +\item {\tt Fault Tolerance}: SLURM can handle a variety of failure +modes without terminating workloads, including crashes of the node +running the SLURM controller. User jobs may be configured to continue +execution despite the failure of one or more nodes on which they are +executing. The user command controlling a job, {\tt srun}, may detach +and reattach from the parallel tasks at any time. + +\item {\tt System Administrator Friendly}: SLURM utilizes +a simple configuration file and minimizes distributed state. +Its configuration may be changed at any time without impacting running +jobs. SLURM interfaces are usable by scripts and its behavior is +highly deterministic. + +\end{itemize} + +As a cluster resource manager, SLURM has three key functions. First, +it allocates exclusive and/or non-exclusive access to resources (compute +nodes) to users for some duration of time so they can perform work. +Second, it provides a framework for starting, executing, and monitoring +work (normally a parallel job) on the set of allocated nodes. Finally, +it arbitrates conflicting requests for resources by managing a queue of +pending work. + +Users interact with SLURM through five command line utilities: \srun\ +for submitting a job for execution and optionally controlling it +interactively, \scancel\ for terminating a pending or running job, +\squeue\ for monitoring job queues, and \sinfo\ for monitoring partition +and overall system state. The \smap\ command combines some of +the capabilities of both \sinfo\ and \squeue\, but displays the +information in a graphical fashion that reflects the system topography. +System administrators perform privileged operations through an +additional command line utility, {\tt scontrol}. + +The central controller daemon, \slurmctld\, maintains the global +state and directs operations. 
Compute nodes simply run a \slurmd\ daemon +(similar to a remote shell daemon) to export control to SLURM. + +\begin{figure}[tb] +\centerline{\epsfig{file=../figures/arch.eps,scale=0.35}} +\caption{\small SLURM architecture} +\label{arch} +\end{figure} + +As shown in Figure~\ref{arch}, SLURM consists of a \slurmd\ daemon +running on each compute node, a central \slurmctld\ daemon running +on a management node (with optional fail-over twin), and five command +line utilities: {\tt srun}, {\tt scancel}, {\tt sinfo}, {\tt squeue}, +and {\tt scontrol}, which can run anywhere in the cluster. + +The entities managed by these SLURM daemons include {\em nodes}, the +compute resource in SLURM, {\em partitions}, which group nodes into +logical disjoint sets, {\em jobs}, or allocations of resources assigned +to a user for a specified amount of time, and {\em job steps}, which +are sets of (possibly parallel) tasks within a job. Each job in the +priority-ordered queue is allocated nodes within a single partition. +Once an allocation request fails, no lower priority jobs for that +partition will be considered for a resource allocation. Once a job is +assigned a set of nodes, the user is able to initiate parallel work in +the form of job steps in any configuration within the allocation. For +instance, a single job step may be started that utilizes all nodes +allocated to the job, or several job steps may independently use a +portion of the allocation. + +\begin{figure}[tcb] +\centerline{\epsfig{file=../figures/entities.eps,scale=0.5}} +\caption{\small SLURM entities: nodes, partitions, jobs, and job steps} +\label{entities} +\end{figure} + +Figure~\ref{entities} further illustrates the interrelation of these +entities as they are managed by SLURM by showing a group of +compute nodes split into two partitions. Partition 1 is running one job, +with one job step utilizing the full allocation of that job. 
The job +in Partition 2 has only one job step using half of the original job +allocation. That job might initiate additional job steps to utilize +the remaining nodes of its allocation. + +\begin{figure}[tb] +\centerline{\epsfig{file=../figures/slurm-arch.eps,scale=0.5}} +\caption{SLURM architecture - subsystems} +\label{archdetail} +\end{figure} + +Figure~\ref{archdetail} shows the subsystems that are implemented +within the \slurmd\ and \slurmctld\ daemons. These subsystems are +explained in more detail below. + +\subsection{Slurmd} + +\slurmd\ is a multi-threaded daemon running on each compute node and +can be compared to a remote shell daemon: it reads the common SLURM +configuration file and saved state information, +notifies the controller that it is active, waits +for work, executes the work, returns status, then waits for more work. +Because it initiates jobs for other users, it must run as user {\em root}. +It also asynchronously exchanges node and job status with {\tt slurmctld}. +The only job information it has at any given time pertains to its +currently executing jobs. \slurmd\ has five major components: + +\begin{itemize} +\item {\tt Machine and Job Status Services}: Respond to controller +requests for machine and job state information and send asynchronous +reports of some state changes (e.g., \slurmd\ startup) to the controller. + +\item {\tt Remote Execution}: Start, manage, and clean up after a set +of processes (typically belonging to a parallel job) as dictated by +the \slurmctld\ daemon or an \srun\ or \scancel\ command. Starting a +process may include executing a prolog program, setting process limits, +setting real and effective uid, establishing environment variables, +setting working directory, allocating interconnect resources, setting core +file paths, initializing stdio, and managing process groups. Terminating +a process may include terminating all members of a process group and +executing an epilog program. 
+ +\item {\tt Stream Copy Service}: Allow handling of stderr, stdout, and +stdin of remote tasks. Job input may be redirected +from a single file or multiple files (one per task), an +\srun\ process, or /dev/null. Job output may be saved into local files or +returned to the \srun\ command. Regardless of the location of stdout/err, +all job output is locally buffered to avoid blocking local tasks. + +\item {\tt Job Control}: Allow asynchronous interaction with the Remote +Execution environment by propagating signals or explicit job termination +requests to any set of locally managed processes. + +\end{itemize} + +\subsection{Slurmctld} + +Most SLURM state information exists in {\tt slurmctld}, also known as +the controller. \slurmctld\ is multi-threaded with independent read +and write locks for the various data structures to enhance scalability. +When \slurmctld\ starts, it reads the SLURM configuration file and +any previously saved state information. Full controller state +information is written to disk periodically, with incremental changes +written to disk immediately for fault tolerance. \slurmctld\ runs in +either master or standby mode, depending on the state of its fail-over +twin, if any. \slurmctld\ need not execute as user {\em root}. In fact, +it is recommended that a unique user entry be created for executing +\slurmctld\ and that user must be identified in the SLURM configuration +file as {\tt SlurmUser}. \slurmctld\ has three major components: + + +\begin{itemize} +\item {\tt Node Manager}: Monitors the state of each node in the cluster. +It polls {\tt slurmd}s for status periodically and receives state +change notifications from \slurmd\ daemons asynchronously. It ensures +that nodes have the prescribed configuration before being considered +available for use. + +\item {\tt Partition Manager}: Groups nodes into non-overlapping sets +called partitions. Each partition can have associated with it +various job limits and access controls. 
The Partition Manager also +allocates nodes to jobs based on node and partition states and +configurations. Requests to initiate jobs come from the Job Manager. +\scontrol\ may be used to administratively alter node and partition +configurations. + +\item {\tt Job Manager}: Accepts user job requests and places pending jobs +in a priority-ordered queue. The Job Manager is awakened on a periodic +basis and whenever there is a change in state that might permit a job to +begin running, such as job completion, job submission, partition {\em up} +transition, node {\em up} transition, etc. The Job Manager then makes +a pass through the priority-ordered job queue. The highest priority +jobs for each partition are allocated resources as possible. As soon as +an allocation failure occurs for any partition, no lower-priority jobs +for that partition are considered for initiation. After completing the +scheduling cycle, the Job Manager's scheduling thread sleeps. Once a +job has been allocated resources, the Job Manager transfers necessary +state information to those nodes, permitting it to commence execution. +When the Job Manager detects that all nodes associated with a job +have completed their work, it initiates cleanup and performs another +scheduling cycle as described above. + +\end{itemize} + +\subsection{Command Line Utilities} + +The command line utilities offer users access to remote execution and +job control. They also permit administrators to dynamically change +the system configuration. These commands use SLURM APIs that are +directly available for more sophisticated applications. + +\begin{itemize} +\item {\tt scancel}: Cancel a running or a pending job or job step, +subject to authentication and authorization. This command can also be +used to send an arbitrary signal to all processes on all nodes associated +with a job or job step. 
+ +\item {\tt scontrol}: Perform privileged administrative commands +such as bringing down a node or partition in preparation for maintenance. +Many \scontrol\ functions can only be executed by privileged users. + +\item {\tt sinfo}: Display a summary of partition and node information. +An assortment of filtering and output format options are available. + +\item {\tt squeue}: Display the queue of running and waiting jobs and/or +job steps. A wide assortment of filtering, sorting, and output format +options are available. + +\item {\tt srun}: Allocate resources, submit jobs to the SLURM queue, +and initiate parallel tasks (job steps). Every set of executing parallel +tasks has an associated \srun\ that initiated it and, if the \srun\ +persists, manages it. Jobs may be submitted for later execution +(e.g., batch), in which case \srun\ terminates after job submission. +Jobs may also be submitted for interactive execution, where \srun\ keeps +running to shepherd the running job. In this case, \srun\ negotiates +connections with remote {\tt slurmd}s for job initiation and to get +stdout and stderr, forward stdin,\footnote{\srun\ command line options +select the stdin handling method, such as broadcast to all tasks, or +send only to task 0.} and respond to signals from the user. \srun\ +may also be instructed to allocate a set of resources and spawn a shell +with access to those resources. + +\end{itemize} + +\subsection{Plugins} + +In order to simplify the use of different infrastructures, +SLURM uses a general purpose plugin mechanism. A SLURM plugin is a +dynamically linked code object that is loaded explicitly at run time +by the SLURM libraries. A plugin provides a customized implementation +of a well-defined API connected to tasks such as authentication, +interconnect fabric, and task scheduling. A common set of functions is defined +for use by all of the different infrastructures of a particular variety. 
+For example, the authentication plugin must define functions such as +{\tt slurm\_auth\_create} to create a credential, {\tt slurm\_auth\_verify} +to verify a credential to approve or deny authentication, +{\tt slurm\_auth\_get\_uid} to get the uid associated with a specific +credential, etc. It also must define the data structure used, a plugin +type, a plugin version number, etc. When a SLURM daemon is initiated, it +reads the configuration file to determine which of the available plugins +should be used. For example {\em AuthType=auth/authd} says to use the +plugin for authd based authentication and {\em PluginDir=/usr/local/lib} +identifies the directory in which to find the plugin. + +\subsection{Communications Layer} + +SLURM presently uses Berkeley sockets for communications. However, +we anticipate using the plugin mechanism to permit use of other +communications layers. At LLNL we are using an ethernet network +for SLURM communications and the Quadrics Elan switch exclusively +for user applications. The SLURM configuration file permits the +identification of each node's hostname as well as its name to be used +for communications. In the case of a control machine known as {\em mcri} +to be communicated with using the name {\em emcri} (say to indicate an +ethernet communications path), this is represented in the configuration +file as {\em ControlMachine=mcri ControlAddr=emcri}. The name used for +communication is the same as the hostname unless otherwise specified. + +Internal SLURM functions pack and unpack data structures in machine +independent format. We considered the use of XML style messages, but we +felt this would adversely impact performance (albeit slightly). If XML +support is desired, it is straightforward to perform a translation and +use the SLURM APIs. + + +\subsection{Security} + +SLURM has a simple security model: any user of the cluster may submit +parallel jobs to execute and cancel his own jobs. 
Any user may view +SLURM configuration and state information. Only privileged users +may modify the SLURM configuration, cancel any job, or perform other +restricted activities. Privileged users in SLURM include the users +{\em root} and {\em SlurmUser} (as defined in the SLURM configuration file). +If permission to modify SLURM configuration is required by others, set-uid +programs may be used to grant specific permissions to specific users. + +\subsubsection{Communication Authentication.} + +Historically, inter-node authentication has been accomplished via the use +of reserved ports and set-uid programs. In this scheme, daemons check the +source port of a request to ensure that it is less than a certain value +and thus only accessible by {\em root}. The communications over that +connection are then implicitly trusted. Because reserved ports are a +limited resource and set-uid programs are a possible security concern, +we have employed a credential-based authentication scheme that +does not depend on reserved ports. In this design, a SLURM authentication +credential is attached to every message and authoritatively verifies the +uid and gid of the message originator. Once recipients of SLURM messages +verify the validity of the authentication credential, they can use the uid +and gid from the credential as the verified identity of the sender. + +The actual implementation of the SLURM authentication credential is +relegated to an ``auth'' plugin. We presently have implemented three +functional authentication plugins: authd\cite{Authd2002}, +Munge, and none. The ``none'' authentication type employs a null +credential and is only suitable for testing and networks where security +is not a concern. Both the authd and Munge implementations employ +cryptography to generate a credential for the requesting user that +may then be authoritatively verified on any remote nodes. However, +authd assumes a secure network and Munge does not. 
Other authentication +implementations, such as a credential based on Kerberos, should be easy +to develop using the auth plugin API. + +\subsubsection{Job Authentication.} + +When resources are allocated to a user by the controller, a ``job step +credential'' is generated by combining the uid, job id, step id, the list +of resources allocated (nodes), and the credential lifetime and signing +the result with the \slurmctld\ private key. This credential grants the +user access to allocated resources and removes the burden from \slurmd\ +to contact the controller to verify requests to run processes. \slurmd\ +verifies the signature on the credential against the controller's public +key and runs the user's request if the credential is valid. Part of the +credential signature is also used to validate stdout, stdin, +and stderr connections from \slurmd\ to \srun . + +\subsubsection{Authorization.} + +Access to partitions may be restricted via a {\em RootOnly} flag. +If this flag is set, job submit or allocation requests to this partition +are only accepted if the effective uid originating the request is a +privileged user. A privileged user may submit a job as any other user. +This may be used, for example, to provide specific external schedulers +with exclusive access to partitions. Individual users will not be +permitted to directly submit jobs to such a partition, which would +prevent the external scheduler from effectively managing it. Access to +partitions may also be restricted to users who are members of specific +Unix groups using a {\em AllowGroups} specification. + +\section{Slurmctld Design} + +\slurmctld\ is modular and multi-threaded with independent read and +write locks for the various data structures to enhance scalability. +The controller includes the following subsystems: Node Manager, Partition +Manager, and Job Manager. Each of these subsystems is described in +detail below. 
+ +\subsection{Node Management} + +The Node Manager monitors the state of nodes. Node information monitored +includes: + +\begin{itemize} +\item Count of processors on the node +\item Size of real memory on the node +\item Size of temporary disk storage +\item State of node (RUN, IDLE, DRAINED, etc.) +\item Weight (preference in being allocated work) +\item Feature (arbitrary description) +\item IP address +\end{itemize} + +The SLURM administrator can specify a list of system node names using +a numeric range in the SLURM configuration file or in the SLURM tools +(e.g., ``{\em NodeName=linux[001-512] CPUs=4 RealMemory=1024 TmpDisk=4096 \linebreak +Weight=4 Feature=Linux}''). These values for CPUs, RealMemory, and +TmpDisk are considered to be the minimal node configuration values +acceptable for the node to enter into service. The \slurmd\ registers +whatever resources actually exist on the node, and this is recorded +by the Node Manager. Actual node resources are checked on \slurmd\ +initialization and periodically thereafter. If a node registers with +less resources than configured, it is placed in DOWN state and +the event logged. Otherwise, the actual resources reported are recorded +and possibly used as a basis for scheduling (e.g., if the node has more +RealMemory than recorded in the configuration file, the actual node +configuration may be used for determining suitability for any application; +alternately, the data in the configuration file may be used for possibly +improved scheduling performance). Note the node name syntax with numeric +range permits even very large heterogeneous clusters to be described in +only a few lines. In fact, a smaller number of unique configurations +can provide SLURM with greater efficiency in scheduling work. + +{\em Weight} is used to order available nodes in assigning work to +them. In a heterogeneous cluster, more capable nodes (e.g., larger memory +or faster processors) should be assigned a larger weight. 
The units +are arbitrary and should reflect the relative value of each resource. +Pending jobs are assigned the least capable nodes (i.e., lowest weight) +that satisfy their requirements. This tends to leave the more capable +nodes available for those jobs requiring those capabilities. + +{\em Feature} is an arbitrary string describing the node, such as a +particular software package, file system, or processor speed. While the +feature does not have a numeric value, one might include a numeric value +within the feature name (e.g., ``1200MHz'' or ``16GB\_Swap''). If the +nodes on the cluster have disjoint features (e.g., different ``shared'' +file systems), one should identify these as features (e.g., ``FS1'', +``FS2'', etc.). Programs may then specify that all nodes allocated to +it should have the same feature, but that any of the specified features +are acceptable (e.g., ``$Feature=FS1|FS2|FS3$'' means the job should be +allocated nodes that all have the feature ``FS1'' or they all have feature +``FS2,'' etc.). + +Node records are kept in an array with hash table lookup. If nodes are +given names containing sequence numbers (e.g., ``lx01'', ``lx02'', etc.), +the hash table permits specific node records to be located very quickly; +therefore, this is our recommended naming convention for larger clusters. + +An API is available to view any of this information and to update some +node information (e.g., state). APIs designed to return SLURM state +information permit the specification of a time stamp. If the requested +data has not changed since the time stamp specified by the application, +the application's current information need not be updated. The API +returns a brief ``No Change'' response rather than returning relatively +verbose state information. Changes in node configurations (e.g., node +count, memory, etc.) or the nodes actually in the cluster should be +reflected in the SLURM configuration files. 
SLURM configuration may be +updated without disrupting any jobs. + +\subsection{Partition Management} + +The Partition Manager identifies groups of nodes to be used for execution +of user jobs. One might consider this the actual resource scheduling +component. Data associated with a partition includes: + +\begin{itemize} +\item Name +\item RootOnly flag to indicate that only users {\em root} or +{\tt SlurmUser} may allocate resources in this partition (for any user) +\item List of associated nodes +\item State of partition (UP or DOWN) +\item Maximum time limit for any job +\item Minimum and maximum nodes allocated to any single job +\item List of groups permitted to use the partition (defaults to ALL) +\item Shared access (YES, NO, or FORCE) +\item Default partition (if no partition is specified in a job request) +\end{itemize} + +It is possible to alter most of this data in real-time in order to affect +the scheduling of pending jobs (currently executing jobs would not be +affected). This information is confined to the controller machine(s) +for better scalability. It is used by the Job Manager (and possibly an +external scheduler), which either exist only on the control machine or +communicate only with the control machine. + +The nodes in a partition may be designated for exclusive or non-exclusive +use by a job. A {\tt shared} value of YES indicates that jobs may +share nodes on request. A {\tt shared} value of NO indicates that +jobs are always given exclusive use of allocated nodes. A {\tt shared} +value of FORCE indicates that jobs are never ensured exclusive +access to nodes, but SLURM may initiate multiple jobs on the nodes +for improved system utilization and responsiveness. In this case, +job requests for exclusive node access are not honored. Non-exclusive +access may negatively impact the performance of parallel jobs or cause +them to fail upon exhausting shared resources (e.g., memory or disk +space). 
However, shared resources may improve overall system utilization
+and responsiveness. The proper support of shared resources, including
+enforcement of limits on these resources, entails a substantial amount of
+effort, which we are not presently planning to expend. However, we have
+designed SLURM so as to not preclude the addition of such a capability
+at a later time if so desired. Future enhancements could include
+constraining jobs to a specific CPU count or memory size within a
+node, which could be used to effectively space-share individual nodes.
+The Partition Manager will allocate nodes to pending jobs on request
+from the Job Manager.
+
+Submitted jobs can specify desired partition, time limit, node count
+(minimum and maximum), CPU count (minimum), task count, the need for
+contiguous node assignment, and an explicit list of nodes to be included
+and/or excluded in its allocation. Nodes are selected so as to satisfy
+all job requirements. For example, a job requesting four CPUs and four
+nodes will actually be allocated eight CPUs and four nodes in the case
+of all nodes having two CPUs each. The request may also indicate node
+configuration constraints such as minimum real memory or CPUs per node,
+required features, shared access, etc. Overall there are 13 different
+parameters that may identify resource requirements for a job.
+
+Nodes are selected for possible assignment to a job based on the
+job's configuration requirements (e.g., partition specification, minimum
+memory, temporary disk space, features, node list, etc.). The selection
+is refined by determining which nodes are up and available for use.
+Groups of nodes are then considered in order of weight, with the nodes
+having the lowest {\em Weight} preferred. Finally, the physical location
+of the nodes is considered.
+
+Bit maps are used to indicate which nodes are up, idle, associated
+with each partition, and associated with each unique configuration.
+This technique permits scheduling decisions to normally be made by
+performing a small number of tests followed by fast bit map manipulations.
+If so configured, a job's resource requirements would be compared with
+the (relatively small number of) node configuration records, each of
+which has an associated bit map. Usable node configuration bitmaps would
+be ANDed with the selected partition's bit map ANDed with the UP node
+bit map and possibly ANDed with the IDLE node bit map (this last test
+depends on the desire to share resources). This method can eliminate
+tens of thousands of individual node configuration comparisons that
+would otherwise be required in large heterogeneous clusters.
+
+The actual selection of nodes for allocation to a job is currently tuned
+for the Quadrics interconnect. This hardware supports hardware message
+broadcast only if the nodes are contiguous. If a job is not allocated
+contiguous nodes, a slower software-based multi-cast mechanism is used.
+Jobs will be allocated contiguous nodes to the extent possible (in
+fact, contiguous node allocation may be specified as a requirement on
+job submission). If contiguous nodes cannot be allocated to a job, it
+will be allocated resources from the minimum number of sets of contiguous
+nodes possible. If multiple sets of contiguous nodes can be allocated
+to a job, the one that most closely fits the job's requirements will
+be used. This technique will leave the largest contiguous sets of nodes
+intact for jobs requiring them.
+
+The Partition Manager builds a list of nodes to satisfy a job's request.
+It also caches the IP addresses of each node and provides this information
+to \srun\ at job initiation time for improved performance.
+
+The failure of any node to respond to the Partition Manager only affects
+jobs associated with that node. In fact, a job may indicate it should
+continue executing even if allocated nodes cease responding.
In this +case, the job needs to provide for its own fault tolerance. All other +jobs and nodes in the cluster will continue to operate after a node +failure. No additional work is allocated to the failed node, and it will +be pinged periodically to determine when it resumes responding. The node +may then be returned to service (depending on the {\tt ReturnToService} +parameter in the SLURM configuration). + +\subsection{Configuration} + +A single configuration file applies to all SLURM daemons and commands. +Most of this information is used only by the controller. Only the +host and port information is referenced by most commands. + +\subsection{Job Manager} + +There are a multitude of parameters associated with each job, including: +\begin{itemize} +\item Job name +\item Uid +\item Job id +\item Working directory +\item Partition +\item Priority +\item Node constraints (processors, memory, features, etc.) +\end{itemize} + +Job records have an associated hash table for rapidly locating +specific records. They also have bit maps of requested and/or +allocated nodes (as described above). + +The core functions supported by the Job Manager include: +\begin{itemize} +\item Request resource (job may be queued) +\item Reset priority of a job +\item Status job (including node list, memory and CPU use data) +\item Signal job (send arbitrary signal to all processes associated + with a job) +\item Terminate job (remove all processes) +\item Change node count of running job (could fail if insufficient +resources are available) +%\item Preempt/resume job (future) +%\item Checkpoint/restart job (future) + +\end{itemize} + +Jobs are placed in a priority-ordered queue and allocated nodes as +selected by the Partition Manager. SLURM implements a very simple default +scheduling algorithm, namely FIFO. An attempt is made to schedule pending +jobs on a periodic basis and whenever any change in job, partition, +or node state might permit the scheduling of a job. 
+
+We are aware that this scheduling algorithm does not satisfy the needs
+of many customers, and we provide the means for establishing other
+scheduling algorithms. Before a newly arrived job is placed into the
+queue, an external scheduler plugin assigns its initial priority.
+A plugin function is also called at the start of each scheduling
+cycle to modify job or system state as desired. SLURM APIs permit an
+external entity to alter the priorities of jobs at any time and re-order
+the queue as desired. The Maui Scheduler \cite{Jackson2001,Maui2002}
+is one example of an external scheduler suitable for use with SLURM.
+
+LLNL uses DPCS \cite{DPCS2002} as SLURM's external scheduler.
+DPCS is a meta-scheduler with flexible scheduling algorithms that
+suit our needs well.
+It also provides the scalability required for this application.
+DPCS maintains pending job state internally and transfers the
+jobs to SLURM (or another underlying resource manager) only when
+they are to begin execution.
+By not transferring jobs to a particular resource manager earlier,
+jobs are assured of being initiated on the first resource satisfying
+their requirements, whether a Linux cluster with SLURM or an IBM SP
+with LoadLeveler (assuming a highly flexible application).
+This mode of operation may also be suitable for computational grid
+schedulers.
+
+In a future release, the Job Manager will collect resource consumption
+information (CPU time used, CPU time allocated, and real memory used)
+associated with a job from the \slurmd\ daemons. Presently, only the
+wall-clock run time of a job is monitored. When a job approaches its
+time limit (as defined by wall-clock execution time) or an imminent
+system shutdown has been scheduled, the job is terminated. The actual
+termination process is to notify \slurmd\ daemons on nodes allocated
+to the job of the termination request.
The \slurmd\ job termination +procedure, including job signaling, is described in Section~\ref{slurmd}. + +One may think of a job as described above as an allocation of resources +rather than a collection of parallel tasks. The job script executes +\srun\ commands to initiate the parallel tasks or ``job steps. '' The +job may include multiple job steps, executing sequentially and/or +concurrently either on separate or overlapping nodes. Job steps have +associated with them specific nodes (some or all of those associated with +the job), tasks, and a task distribution (cyclic or block) over the nodes. + +The management of job steps is considered a component of the Job Manager. +Supported job step functions include: +\begin{itemize} +\item Register job step +\item Get job step information +\item Run job step request +\item Signal job step +\end{itemize} + +Job step information includes a list of nodes (entire set or subset of +those allocated to the job) and a credential used to bind communications +between the tasks across the interconnect. The \slurmctld\ constructs +this credential and sends it to the \srun\ initiating the job step. + +\subsection{Fault Tolerance} +SLURM supports system level fault tolerance through the use of a secondary +or ``backup'' controller. The backup controller, if one is configured, +periodically pings the primary controller. Should the primary controller +cease responding, the backup loads state information from the last state +save and assumes control. When the primary controller is returned to +service, it tells the backup controller to save state and terminate. +The primary then loads state and assumes control. + +SLURM utilities and API users read the configuration file and initially try +to contact the primary controller. Should that attempt fail, an attempt +is made to contact the backup controller before returning an error. + +SLURM attempts to minimize the amount of time a node is unavailable +for work. 
Nodes assigned to jobs are returned to the partition as +soon as they successfully clean up user processes and run the system +epilog. In this manner, +those nodes that fail to successfully run the system epilog, or those +with unkillable user processes, are held out of the partition while +the remaining nodes are quickly returned to service. + +SLURM considers neither the crash of a compute node nor termination +of \srun\ as a critical event for a job. Users may specify on a per-job +basis whether the crash of a compute node should result in the premature +termination of their job. Similarly, if the host on which \srun\ is +running crashes, the job continues execution and no output is lost. + + +\section{Slurmd Design}\label{slurmd} + +The \slurmd\ daemon is a multi-threaded daemon for managing user jobs +and monitoring system state. Upon initiation it reads the configuration +file, recovers any saved state, captures system state, +attempts an initial connection to the SLURM +controller, and awaits requests. It services requests for system state, +accounting information, job initiation, job state, job termination, +and job attachment. On the local node it offers an API to translate +local process ids into SLURM job id. + +The most common action of \slurmd\ is to report system state on request. +Upon \slurmd\ startup and periodically thereafter, it gathers the +processor count, real memory size, and temporary disk space for the +node. Should those values change, the controller is notified. In a +future release of SLURM, \slurmd\ will also capture CPU and real-memory and +virtual-memory consumption from the process table entries for uploading +to {\tt slurmctld}. + +%FUTURE: Another thread is +%created to capture CPU, real-memory and virtual-memory consumption from +%the process table entries. Differences in resource utilization values +%from one process table snapshot to the next are accumulated. 
\slurmd\ +%insures these accumulated values are not decremented if resource +%consumption for a user happens to decrease from snapshot to snapshot, +%which would simply reflect the termination of one or more processes. +%Both the real and virtual memory high-water marks are recorded and +%the integral of memory consumption (e.g. megabyte-hours). Resource +%consumption is grouped by uid and SLURM job id (if any). Data +%is collected for system users ({\em root}, {\em ftp}, {\em ntp}, +%etc.) as well as customer accounts. +%The intent is to capture all resource use including +%kernel, idle and down time. Upon request, the accumulated values are +%uploaded to \slurmctld\ and cleared. + +\slurmd\ accepts requests from \srun\ and \slurmctld\ to initiate +and terminate user jobs. The initiate job request contains such +information as real uid, effective uid, environment variables, working +directory, task numbers, job step credential, interconnect specifications and +authorization, core paths, SLURM job id, and the command line to execute. +System-specific programs can be executed on each allocated node prior +to the initiation of a user job and after the termination of a user +job (e.g., {\em Prolog} and {\em Epilog} in the configuration file). +These programs are executed as user {\em root} and can be used to +establish an appropriate environment for the user (e.g., permit logins, +disable logins, terminate orphan processes, etc.). \slurmd\ executes +the prolog program, resets its session id, and then initiates the job +as requested. It records to disk the SLURM job id, session id, process +id associated with each task, and user associated with the job. In the +event of \slurmd\ failure, this information is recovered from disk in +order to identify active jobs. 
+ +When \slurmd\ receives a job termination request from the SLURM +controller, it sends SIGTERM to all running tasks in the job, +waits for {\em KillWait} seconds (as specified in the configuration +file), then sends SIGKILL. If the processes do not terminate \slurmd\ +notifies \slurmctld , which logs the event and sets the node's state +to DRAINED. After all processes have terminated, \slurmd\ executes the +configured epilog program, if any. + +\section{Command Line Utilities} + +\subsection{scancel} + +\scancel\ terminates queued jobs or signals running jobs or job steps. +The default signal is SIGKILL, which indicates a request to terminate +the specified job or job step. \scancel\ identifies the job(s) to +be signaled through user specification of the SLURM job id, job step id, +user name, partition name, and/or job state. If a job id is supplied, +all job steps associated with the job are affected as well as the job +and its resource allocation. If a job step id is supplied, only that +job step is affected. \scancel\ can only be executed by the job's owner +or a privileged user. + +\subsection{scontrol} + +\scontrol\ is a tool meant for SLURM administration by user {\em root}. +It provides the following capabilities: +\begin{itemize} +\item {\tt Shutdown}: Cause \slurmctld\ and \slurmd\ to save state +and terminate. +\item {\tt Reconfigure}: Cause \slurmctld\ and \slurmd\ to reread the +configuration file. +\item {\tt Ping}: Display the status of primary and backup \slurmctld\ daemons. +\item {\tt Show Configuration Parameters}: Display the values of general SLURM +configuration parameters such as locations of files and values of timers. +\item {\tt Show Job State}: Display the state information of a particular job +or all jobs in the system. +\item {\tt Show Job Step State}: Display the state information of a particular +job step or all job steps in the system. 
+\item {\tt Show Node State}: Display the state and configuration information
+of a particular node, a set of nodes (using numeric ranges syntax to
+identify their names), or all nodes.
+\item {\tt Show Partition State}: Display the state and configuration
+information of a particular partition or all partitions.
+\item {\tt Update Job State}: Update the state information of a particular job
+in the system. Note that not all state information can be changed in this
+fashion (e.g., the nodes allocated to a job).
+\item {\tt Update Node State}: Update the state of a particular node. Note
+that not all state information can be changed in this fashion (e.g., the
+amount of memory configured on a node). In some cases, you may need
+to modify the SLURM configuration file and cause it to be reread
+using the ``Reconfigure'' command described above.
+\item {\tt Update Partition State}: Update the state of a particular
+partition. Note that not all state information can be changed in this fashion
+(e.g., the default partition). In some cases, you may need to modify
+the SLURM configuration file and cause it to be reread using the
+``Reconfigure'' command described above.
+\end{itemize}
+
+\subsection{squeue}
+
+\squeue\ reports the state of SLURM jobs. It can filter these
+jobs based on user specification of job state (RUN, PENDING, etc.), job id,
+user name, job name, etc. If no specification is supplied, the state of
+all pending and running jobs is reported.
+\squeue\ also has a variety of sorting and output options.
+
+\subsection{sinfo}
+
+\sinfo\ reports the state of SLURM partitions and nodes. By default,
+it reports a summary of partition state with node counts and a summary
+of the configuration of those nodes. A variety of sorting and
+output formatting options exist.
+
+\subsection{srun}
+
+\srun\ is the user interface to accessing resources managed by SLURM.
+Users may utilize \srun\ to allocate resources, submit batch jobs, +run jobs interactively, attach to currently running jobs, or launch a +set of parallel tasks (job step) for a running job. \srun\ supports a +full range of options to specify job constraints and characteristics, +for example minimum real memory, temporary disk space, and CPUs per node, +as well as time limits, stdin/stdout/stderr handling, signal handling, +and working directory for job. + +The \srun\ utility can run in four different modes: {\em interactive}, +in which the \srun\ process remains resident in the user's session, +manages stdout/stderr/stdin, and forwards signals to the remote tasks; +{\em batch}, in which \srun\ submits a job script to the SLURM queue for +later execution; {\em allocate}, in which \srun\ requests resources from +the SLURM controller and spawns a shell with access to those resources; +{\em attach}, in which \srun\ attaches to a currently +running job and displays stdout/stderr in real time from the remote +tasks. + +\section{Job Initiation Design} + +There are three modes in which jobs may be run by users under SLURM. The +first and most simple mode is {\em interactive} mode, in which stdout and +stderr are displayed on the user's terminal in real time, and stdin and +signals may be forwarded from the terminal transparently to the remote +tasks. The second mode is {\em batch} or {\em queued} mode, in which the job is +queued until the request for resources can be satisfied, at which time the +job is run by SLURM as the submitting user. In the third mode, {\em allocate} +mode, a job is allocated to the requesting user, under which the user may +manually run job steps via a script or in a sub-shell spawned by \srun . + +\begin{figure}[tb] +\centerline{\epsfig{file=../figures/connections.eps,scale=0.35}} +\caption{\small Job initiation connections overview. 1. \srun\ connects to + \slurmctld\ requesting resources. 2. 
\slurmctld\ issues a response,
+ with list of nodes and job step credential. 3. \srun\ opens a listen
+ port for job IO connections, then sends a run job step
+ request to \slurmd . 4. \slurmd\ initiates job step and connects
+ back to \srun\ for stdout/err }
+\label{connections}
+\end{figure}
+
+Figure~\ref{connections} shows a high-level depiction of the connections
+that occur between SLURM components during a general interactive
+job startup. \srun\ requests a resource allocation and job step
+initiation from the {\tt slurmctld}, which responds with the job id,
+list of allocated nodes, job step credential, etc. if the request is granted.
+\srun\ then initializes a listen port for stdio connections and connects
+to the {\tt slurmd}s on the allocated nodes requesting that the remote
+processes be initiated. The {\tt slurmd}s begin execution of the tasks and
+connect back to \srun\ for stdout and stderr. This process is described
+in more detail below. Details of the batch and allocate modes of operation
+are not presented due to space constraints.
+
+\subsection{Interactive Job Initiation}
+
+\begin{figure}[tb]
+\centerline{\epsfig{file=../figures/interactive-job-init.eps,scale=0.45} }
+\caption{\small Interactive job initiation. \srun\ simultaneously allocates
+ nodes and a job step from \slurmctld\ then sends a run request to all
+ {\tt slurmd}s in job. Dashed arrows indicate a periodic request that
+ may or may not occur during the lifetime of the job}
+\label{init-interactive}
+\end{figure}
+
+Interactive job initiation is shown in
+Figure~\ref{init-interactive}. The process begins with a user invoking
+\srun\ in interactive mode. In Figure~\ref{init-interactive}, the user
+has requested an interactive run of the executable ``{\tt cmd}'' in the
+default partition.
+
+After processing command line options, \srun\ sends a message to
+\slurmctld\ requesting a resource allocation and a job step initiation.
+This message simultaneously requests an allocation (or job) and a job
+step. \srun\ waits for a reply from {\tt slurmctld}, which may not come
+instantly if the user has requested that \srun\ block until resources are
+available. When resources are available for the user's job, \slurmctld\
+replies with a job step credential, list of nodes that were allocated,
+cpus per node, and so on. \srun\ then sends a message to each \slurmd\ on
+the allocated nodes requesting that a job step be initiated.
+The \slurmd\ daemons verify that the job is valid using the forwarded job
+step credential and then respond to \srun .
+
+Each \slurmd\ invokes a job manager process to handle the request, which
+in turn invokes a session manager process that initializes the session for
+the job step. An IO thread is created in the job manager that connects
+all tasks' IO back to a port opened by \srun\ for stdout and stderr.
+Once stdout and stderr have successfully been connected, the task thread
+takes the necessary steps to initiate the user's executable on the node,
+initializing environment, current working directory, and interconnect
+resources if needed.
+
+Each \slurmd\ forks a copy of itself that is responsible for the job
+step on this node. This local job manager process then creates an
+IO thread that initializes stdout, stdin, and stderr streams for each
+local task and connects these streams to the remote \srun . Meanwhile,
+the job manager forks a session manager process that initializes
+the session, becomes the requesting user, and invokes the user's processes.
+
+As user processes exit, their exit codes are collected, aggregated when
+possible, and sent back to \srun\ in the form of a task exit message.
+Once all tasks have exited, the session manager exits, and the job
+manager process waits for the IO thread to complete, then exits.
+The \srun\ process either waits for all tasks to exit, or attempts to +clean up the remaining processes some time after the first task exits +(based on user option). Regardless, once all tasks are finished, +\srun\ sends a message to the \slurmctld\ releasing the allocated nodes, +then exits with an appropriate exit status. + +When the \slurmctld\ receives notification that \srun\ no longer needs +the allocated nodes, it issues a request for the epilog to be run on +each of the {\tt slurmd}s in the allocation. As {\tt slurmd}s report that the +epilog ran successfully, the nodes are returned to the partition. + +\section{Results} + +\begin{figure}[htb] +\centerline{\epsfig{file=../figures/times.eps,scale=0.7}} +\caption{\small Time to execute /bin/hostname with various node counts} +\label{timing} +\end{figure} + +We were able to perform some SLURM tests on a 1000-node cluster +in November 2002. Some development was still underway at that time +and tuning had not been performed. The results for executing the +program {\em /bin/hostname} on two tasks per node and various node +counts are shown in Figure~\ref{timing}. We found SLURM performance +to be comparable to the +Quadrics Resource Management System (RMS) \cite{Quadrics2002} for all +job sizes and about 80 times faster than IBM LoadLeveler\cite{LL2002} +at tested job sizes. + +\section{Future Plans} + +\section{Acknowledgments} + +SLURM is jointly developed by LLNL and Linux NetworX. +Contributors to SLURM development include: +\begin{itemize} +\item Jay Windley of Linux NetworX for his development of the plugin +mechanism and work on the security components +\item Mark Seager and Greg Tomaschke for their support of this project +\end{itemize} + +\raggedright +% make the bibliography +\bibliographystyle{splncs} +\bibliography{project} + +% make the back cover page +%\makeLLNLBackCover +\end{document}