From bd4366e8fe35c487b94627f4dbb64917dd2f106e Mon Sep 17 00:00:00 2001
From: Moe Jette <jette1@llnl.gov>
Date: Wed, 9 Jul 2008 18:15:23 +0000
Subject: [PATCH] In salloc, sbatch, and srun replace the --task-mem option
 with --mem-per-cpu (--task-mem will continue to be accepted for now, but is
 not documented). Replace DefMemPerTask and MaxMemPerTask with DefMemPerCPU,
 DefMemPerNode, MaxMemPerCPU and MaxMemPerNode in slurm.conf. Allocate a
 job's memory at the same time that processors are allocated, based upon the
 --mem or --mem-per-cpu option, rather than when job steps are initiated.

---
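Illustrative usage with the new options (the program names and memory values,
in megabytes, are examples only):

    srun -n16 --mem-per-cpu=512 ./my_app     # 512 MB per allocated CPU
    sbatch -N4 --mem=2048 my_job.sh          # 2048 MB per allocated node
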
 NEWS                                          |   6 +
 doc/html/configurator.html.in                 |   4 +-
 doc/html/cons_res.shtml                       | 220 +++--
 doc/html/cons_res_share.shtml                 |  35 +-
 doc/html/gang_scheduling.shtml                | 751 +++++++++---------
 doc/html/preempt.shtml                        |  13 +-
 doc/man/man1/salloc.1                         |  34 +-
 doc/man/man1/sbatch.1                         |  33 +-
 doc/man/man1/srun.1                           |  35 +-
 doc/man/man5/slurm.conf.5                     |  46 +-
 slurm/slurm.h.in                              |   5 +-
 src/api/config_info.c                         |  20 +-
 src/api/init_msg.c                            | 148 ++--
 src/api/job_info.c                            |   9 +-
 src/api/step_ctx.c                            |  14 +-
 src/common/read_config.c                      |  24 +-
 src/common/read_config.h                      |   4 +-
 src/plugins/select/cons_res/select_cons_res.c |  39 +-
 src/plugins/select/linear/select_linear.c     | 119 ++-
 src/salloc/opt.c                              |  39 +-
 src/salloc/opt.h                              |   2 +-
 src/salloc/salloc.c                           |   6 +-
 src/sbatch/opt.c                              |  41 +-
 src/sbatch/opt.h                              |   2 +-
 src/sbatch/sbatch.c                           |   2 +
 src/scontrol/update_job.c                     |  11 +-
 src/slurmctld/job_mgr.c                       |  90 ++-
 src/slurmctld/node_scheduler.c                |  15 +-
 src/slurmctld/slurmctld.h                     |   3 +-
 src/slurmd/slurmd/req.c                       |  38 +-
 src/squeue/print.c                            |   1 +
 src/squeue/sort.c                             |   2 +
 src/srun/allocate.c                           |   6 +-
 src/srun/opt.c                                |  57 +-
 src/srun/opt.h                                |   2 +-
 testsuite/expect/test1.23                     |   2 +-
 testsuite/expect/test15.7                     |   2 +-
 testsuite/expect/test17.10                    |   2 +-
 38 files changed, 1083 insertions(+), 799 deletions(-)

diff --git a/NEWS b/NEWS
index 17c68195dee..262ff7da95a 100644
--- a/NEWS
+++ b/NEWS
@@ -7,6 +7,12 @@ documents those changes that are of interest to users and admins.
     getting information for all jobs. Improved performance of some commands. 
     NOTE: Change in RPC means all nodes in the cluster should be updated 
     at the same time.
+ -- In salloc, sbatch, and srun replace --task-mem options with --mem-per-cpu
+    (--task-mem will continue to be accepted for now, but is not documented).
+    Replace DefMemPerTask and MaxMemPerTask with DefMemPerCPU, DefMemPerNode,
+    MaxMemPerCPU and MaxMemPerNode in slurm.conf. Allocate a job's memory
+    at the same time that processors are allocated, based upon the --mem or
+    --mem-per-cpu option, rather than when job steps are initiated.
 
 * Changes in SLURM 1.3.5
 ========================
diff --git a/doc/html/configurator.html.in b/doc/html/configurator.html.in
index 2b61e0922a9..7fca6c422b7 100644
--- a/doc/html/configurator.html.in
+++ b/doc/html/configurator.html.in
@@ -206,9 +206,9 @@ function displayfile()
    "# <br>" +
    "# <br>" +
    "# SCHEDULING <br>" +
-   "#DefMemPerTask=0 <br>" +
+   "#DefMemPerCPU=0 <br>" +
    "FastSchedule=" + get_radio_value(document.config.fast_schedule) + "<br>" +
-   "#MaxMemPerTask=0 <br>" +
+   "#MaxMemPerCPU=0 <br>" +
    "#SchedulerRootFilter=1 <br>" +
    "#SchedulerTimeSlice=30 <br>" +
    "SchedulerType=sched/" + get_radio_value(document.config.sched_type) + "<br>" +
diff --git a/doc/html/cons_res.shtml b/doc/html/cons_res.shtml
index ae4f0229330..368810a9ebc 100644
--- a/doc/html/cons_res.shtml
+++ b/doc/html/cons_res.shtml
@@ -28,15 +28,15 @@ this plug-in is described below.
     slurm.conf (e.g. <i>SelectType=select/cons_res</i>).</li>
 <pre>
 #
-# "SelectType"			: node selection logic for scheduling.
-#	"select/bluegene"	: the default on BlueGene systems, aware of
-#				  system topology, manages bglblocks, etc.
-#	"select/cons_res"	: allocate individual consumable resources
-#				  (i.e. processors, memory, etc.)
-#	"select/linear"		: the default on non-BlueGene systems,
-#				  no topology awareness, oriented toward
-#				  allocating nodes to jobs rather than
-#				  resources within a node (e.g. CPUs)
+# "SelectType"         : node selection logic for scheduling.
+#    "select/bluegene" : the default on BlueGene systems, aware of
+#                        system topology, manages bglblocks, etc.
+#    "select/cons_res" : allocate individual consumable resources
+#                        (i.e. processors, memory, etc.)
+#    "select/linear"   : the default on non-BlueGene systems,
+#                        no topology awareness, oriented toward
+#                        allocating nodes to jobs rather than
+#                        resources within a node (e.g. CPUs)
 #
 # SelectType=select/linear
 SelectType=select/cons_res
@@ -98,15 +98,15 @@ SelectType=select/cons_res
       SelectTypeParameter in the slurm.conf.</li>
 <pre>
 #
-# "SelectType"			: node selection logic for scheduling.
-#	"select/bluegene"	: the default on BlueGene systems, aware of
-#				  system topology, manages bglblocks, etc.
-#	"select/cons_res"	: allocate individual consumable resources
-#				  (i.e. processors, memory, etc.)
-#	"select/linear"		: the default on non-BlueGene systems,
-#				  no topology awareness, oriented toward
-#				  allocating nodes to jobs rather than
-#				  resources within a node (e.g. CPUs)
+# "SelectType"         : node selection logic for scheduling.
+#    "select/bluegene" : the default on BlueGene systems, aware of
+#                        system topology, manages bglblocks, etc.
+#    "select/cons_res" : allocate individual consumable resources
+#                        (i.e. processors, memory, etc.)
+#    "select/linear"   : the default on non-BlueGene systems,
+#                        no topology awareness, oriented toward
+#                        allocating nodes to jobs rather than
+#                        resources within a node (e.g. CPUs)
 #
 # SelectType=select/linear
 SelectType=select/cons_res
@@ -115,34 +115,33 @@ SelectType=select/cons_res
 #    - select/bluegene - this parameter is currently ignored
 #    - select/linear   - this parameter is currently ignored
 #    - select/cons_res - the parameters available are
-#          - CR_CPU     (1) - CPUs as consumable resources. 
-#                      	No notion of sockets, cores, or threads. 
-#                      	On a multi-core system CPUs will be cores
-#                      	On a multi-core/hyperthread system CPUs will 
-#                      		       be threads
-#                      	On a single-core systems CPUs are CPUs. ;-)
-#          - CR_Socket (2) - Sockets as a consumable resource.
-#          - CR_Core   (3) - Cores as a consumable resource. 
-#				(Not yet implemented)
-#          - CR_Memory (4) - Memory as a consumable resource. 
-#				Note! CR_Memory assumes Shared=Yes
-#          - CR_Socket_Memory (5) - Socket and Memory as consumable 
-#				resources.
-#          - CR_Core_Memory (6) - Core and Memory as consumable 
-#				resources. (Not yet implemented)	
-#          - CR_CPU_Memory (7) - CPU and Memory as consumable 
-#				resources.
+#       - CR_CPU  (1)  - CPUs as consumable resources. 
+#                        No notion of sockets, cores, or threads. 
+#                        On a multi-core system CPUs will be cores
+#                        On a multi-core/hyperthread system CPUs 
+#                                        will be threads
+#                        On a single-core system CPUs are CPUs.
+#      - CR_Socket (2) - Sockets as a consumable resource.
+#      - CR_Core   (3) - Cores as a consumable resource. 
+#      - CR_Memory (4) - Memory as a consumable resource. 
+#                        Note! CR_Memory assumes Shared=Yes
+#      - CR_Socket_Memory (5) - Socket and Memory as consumable 
+#                               resources.
+#      - CR_Core_Memory (6)   - Core and Memory as consumable 
+#                               resources. (Not yet implemented)
+#      - CR_CPU_Memory (7)    - CPU and Memory as consumable 
+#                               resources.
 #
 # (#) refer to the output of "scontrol show config"
 #
-# NB!:	The -E extension for sockets, cores, and threads 
-#	are ignored within the node allocation mechanism 
-#	when CR_CPU or CR_CPU_MEMORY is selected. 
-#	They are considered to compute the total number of 
-#	tasks when -n is not specified
+# NB!:   The -E extension for sockets, cores, and threads 
+#        is ignored within the node allocation mechanism 
+#        when CR_CPU or CR_CPU_MEMORY is selected. 
+#        It is only used to compute the total number of 
+#        tasks when -n is not specified
 #
 # NB! All CR_s assume Shared=No or Shared=Force EXCEPT for 
-#	CR_MEMORY which assumes Shared=Yes
+#        CR_MEMORY which assumes Shared=Yes
 #
 #SelectTypeParameters=CR_CPU (default)
 </pre>
@@ -169,7 +168,7 @@ SelectType=select/cons_res
     way as when using the default node selection scheme.</li>
   <li>The <i>--exclusive</i> srun switch allows users to request nodes in 
     exclusive mode even when consumable resources is enabled. see "man srun" 
-    for details. </li>	
+    for details. </li>        
   <li>srun's <i>-s</i> or <i>--share</i> is incompatible with the consumable resource 
     environment and will therefore not be honored. Since in this environment nodes 
     are shared by default, <i>--exclusive</i> allows users to obtain dedicated nodes.</li>
@@ -213,19 +212,18 @@ Please send comments and requests about the consumable resources to
 # srun sleep 100 &
 # srun sleep 100 &
 # squeue
-  JOBID PARTITION     NAME     USER  ST       TIME  NODES NODELIST(REASON)
-   1132  allNodes    sleep   sballe   R       0:05      1 hydra12
-   1133  allNodes    sleep   sballe   R       0:04      1 hydra12
-   1134  allNodes    sleep   sballe   R       0:02      1 hydra12
+JOBID PARTITION   NAME     USER  ST   TIME  NODES NODELIST(REASON)
+ 1132  allNodes  sleep   sballe   R   0:05      1 hydra12
+ 1133  allNodes  sleep   sballe   R   0:04      1 hydra12
+ 1134  allNodes  sleep   sballe   R   0:02      1 hydra12
 # srun -N 2-2 -E 2:2 sleep 100 &
 srun: job 1135 queued and waiting for resources
 #squeue
-  JOBID PARTITION     NAME     USER  ST       TIME  NODES NODELIST(REASON)
-   1135  allNodes    sleep   sballe  PD       0:00      2 (Resources)
-   1132  allNodes    sleep   sballe   R       0:24      1 hydra12
-   1133  allNodes    sleep   sballe   R       0:23      1 hydra12
-   1134  allNodes    sleep   sballe   R       0:21      1 hydra12
-#
+JOBID PARTITION   NAME     USER  ST   TIME  NODES NODELIST(REASON)
+ 1135  allNodes  sleep   sballe  PD   0:00      2 (Resources)
+ 1132  allNodes  sleep   sballe   R   0:24      1 hydra12
+ 1133  allNodes  sleep   sballe   R   0:23      1 hydra12
+ 1134  allNodes  sleep   sballe   R   0:21      1 hydra12
 </pre>
     <li><b>Proposed solution:</b> Enhance the selection mechanism to go through {node,socket,core,thread}-tuplets to find available match for specific request (bounded knapsack problem). </li>
     </ul>
@@ -248,7 +246,7 @@ srun: job 1135 queued and waiting for resources
 <h2>Examples of CR_Memory, CR_Socket_Memory, and CR_CPU_Memory type consumable resources</h2> 
 
 <pre>
-sinfo -lNe
+# sinfo -lNe
 NODELIST     NODES PARTITION  STATE  CPUS  S:C:T MEMORY 
 hydra[12-16]     5 allNodes*  ...       4  2:2:1   2007 
 </pre>
@@ -256,59 +254,59 @@ hydra[12-16]     5 allNodes*  ...       4  2:2:1   2007
 <p>Using select/cons_res plug-in with CR_Memory</p>
 <pre>
 Example:
-srun -N 5 -n 20 --job-mem=1000 sleep 100 &  <-- running
-srun -N 5 -n 20 --job-mem=10 sleep 100 &    <-- running 
-srun -N 5 -n 10 --job-mem=1000 sleep 100 &  <-- queued and waiting for resources
-
-squeue
-  JOBID PARTITION     NAME     USER  ST       TIME  NODES NODELIST(REASON)
-   1820  allNodes    sleep   sballe  PD       0:00      5 (Resources)
-   1818  allNodes    sleep   sballe   R       0:17      5 hydra[12-16]
-   1819  allNodes    sleep   sballe   R       0:11      5 hydra[12-16]
+# srun -N 5 -n 20 --job-mem=1000 sleep 100 &  <-- running
+# srun -N 5 -n 20 --job-mem=10 sleep 100 &    <-- running 
+# srun -N 5 -n 10 --job-mem=1000 sleep 100 &  <-- queued and waiting for resources
+
+# squeue
+JOBID PARTITION   NAME     USER  ST   TIME  NODES NODELIST(REASON)
+ 1820  allNodes  sleep   sballe  PD   0:00      5 (Resources)
+ 1818  allNodes  sleep   sballe   R   0:17      5 hydra[12-16]
+ 1819  allNodes  sleep   sballe   R   0:11      5 hydra[12-16]
 </pre>
 
 <p>Using select/cons_res plug-in with CR_Socket_Memory (2 sockets/node)</p>
 <pre>
 Example 1:
-srun -N 5 -n 5 --job-mem=1000 sleep 100 &        <-- running
-srun -n 1 -w hydra12 --job-mem=2000 sleep 100 &  <-- queued and waiting for resources
+# srun -N 5 -n 5 --job-mem=1000 sleep 100 &        <-- running
+# srun -n 1 -w hydra12 --job-mem=2000 sleep 100 &  <-- queued and waiting for resources
 
-squeue
-  JOBID PARTITION     NAME     USER  ST       TIME  NODES NODELIST(REASON)
-   1890  allNodes    sleep   sballe  PD       0:00      1 (Resources)
-   1889  allNodes    sleep   sballe   R       0:08      5 hydra[12-16]
+# squeue
+JOBID PARTITION   NAME     USER  ST   TIME  NODES NODELIST(REASON)
+ 1890  allNodes  sleep   sballe  PD   0:00      1 (Resources)
+ 1889  allNodes  sleep   sballe   R   0:08      5 hydra[12-16]
 
 Example 2:
-srun -N 5 -n 10 --job-mem=10 sleep 100 & <-- running 
-srun -n 1 --job-mem=10 sleep 100 & <-- queued and waiting for resourcessqueue
+# srun -N 5 -n 10 --job-mem=10 sleep 100 & <-- running 
+# srun -n 1 --job-mem=10 sleep 100 & <-- queued and waiting for resources
 
-squeue
-  JOBID PARTITION     NAME     USER  ST       TIME  NODES NODELIST(REASON)
-   1831  allNodes    sleep   sballe  PD       0:00      1 (Resources)
-   1830  allNodes    sleep   sballe   R       0:07      5 hydra[12-16]
+# squeue
+JOBID PARTITION   NAME     USER  ST   TIME  NODES NODELIST(REASON)
+ 1831  allNodes  sleep   sballe  PD   0:00      1 (Resources)
+ 1830  allNodes  sleep   sballe   R   0:07      5 hydra[12-16]
 </pre>
 
 <p>Using select/cons_res plug-in with CR_CPU_Memory (4 CPUs/node)</p>
 <pre>
 Example 1:
-srun -N 5 -n 5 --job-mem=1000 sleep 100 &  <-- running 
-srun -N 5 -n 5 --job-mem=10 sleep 100 &    <-- running
-srun -N 5 -n 5 --job-mem=1000 sleep 100 &  <-- queued and waiting for resources
+# srun -N 5 -n 5 --job-mem=1000 sleep 100 &  <-- running 
+# srun -N 5 -n 5 --job-mem=10 sleep 100 &    <-- running
+# srun -N 5 -n 5 --job-mem=1000 sleep 100 &  <-- queued and waiting for resources
 
-squeue
-  JOBID PARTITION     NAME     USER  ST       TIME  NODES NODELIST(REASON)
-   1835  allNodes    sleep   sballe  PD       0:00      5 (Resources)
-   1833  allNodes    sleep   sballe   R       0:10      5 hydra[12-16]
-   1834  allNodes    sleep   sballe   R       0:07      5 hydra[12-16]
+# squeue
+JOBID PARTITION   NAME     USER  ST   TIME  NODES NODELIST(REASON)
+ 1835  allNodes  sleep   sballe  PD   0:00      5 (Resources)
+ 1833  allNodes  sleep   sballe   R   0:10      5 hydra[12-16]
+ 1834  allNodes  sleep   sballe   R   0:07      5 hydra[12-16]
 
 Example 2:
-srun -N 5 -n 20 --job-mem=10 sleep 100 & <-- running 
-srun -n 1 --job-mem=10 sleep 100 &       <-- queued and waiting for resources
+# srun -N 5 -n 20 --job-mem=10 sleep 100 & <-- running 
+# srun -n 1 --job-mem=10 sleep 100 &       <-- queued and waiting for resources
 
-squeue
-  JOBID PARTITION     NAME     USER  ST       TIME  NODES NODELIST(REASON)
-   1837  allNodes    sleep   sballe  PD       0:00      1 (Resources)
-   1836  allNodes    sleep   sballe   R       0:11      5 hydra[12-16]
+# squeue
+JOBID PARTITION   NAME     USER  ST   TIME  NODES NODELIST(REASON)
+ 1837  allNodes  sleep   sballe  PD   0:00      1 (Resources)
+ 1836  allNodes  sleep   sballe   R   0:11      5 hydra[12-16]
 </pre>
 
 <p class="footer"><a href="#top">top</a></p>
@@ -365,11 +363,11 @@ have one idle cpu and linux04 has 3 idle cpus.</p>
 
 <pre>
 # squeue
-  JOBID PARTITION     NAME     USER  ST       TIME  NODES NODELIST(REASON)
-      3       lsf    sleep     root  PD       0:00      3 (Resources)
-      4       lsf    sleep     root  PD       0:00      1 (Resources)
-      5       lsf    sleep     root  PD       0:00      1 (Resources)
-      2       lsf    sleep     root   R       0:14      4 xc14n[13-16]
+JOBID PARTITION   NAME   USER  ST   TIME  NODES NODELIST(REASON)
+    3       lsf  sleep   root  PD   0:00      3 (Resources)
+    4       lsf  sleep   root  PD   0:00      1 (Resources)
+    5       lsf  sleep   root  PD   0:00      1 (Resources)
+    2       lsf  sleep   root   R   0:14      4 xc14n[13-16]
 </pre>
 
 <p>Once Job 2 is finished, Job 3 is scheduled and runs on
@@ -381,10 +379,10 @@ and Job 4 can run concurrently on the cluster.</p>
 
 <pre>
 # squeue
-  JOBID PARTITION     NAME     USER  ST       TIME  NODES NODELIST(REASON)
-      5       lsf    sleep     root  PD       0:00      1 (Resources)
-      3       lsf    sleep     root   R       0:11      3 xc14n[13-15]
-      4       lsf    sleep     root   R       0:11      1 xc14n16
+JOBID PARTITION   NAME   USER  ST   TIME  NODES NODELIST(REASON)
+    5       lsf  sleep   root  PD   0:00      1 (Resources)
+    3       lsf  sleep   root   R   0:11      3 xc14n[13-15]
+    4       lsf  sleep   root   R   0:11      1 xc14n16
 </pre>
 
 <p>Once Job 3 finishes, Job 5 is allocated resources and can run.</p>
@@ -426,16 +424,16 @@ nodes) and Job 4 is scheduled onto one of the remaining idle cpus on Linux04.</p
 <pre>
 
 # squeue
-  JOBID PARTITION     NAME     USER  ST       TIME  NODES NODELIST(REASON)
-      5       lsf    sleep     root  PD       0:00      1 (Resources)
-      2       lsf    sleep     root   R       0:13      4 linux[01-04]
-      3       lsf    sleep     root   R       0:09      3 linux[01-03]
-      4       lsf    sleep     root   R       0:05      1 linux04
+JOBID PARTITION   NAME   USER  ST   TIME  NODES NODELIST(REASON)
+    5       lsf  sleep   root  PD   0:00      1 (Resources)
+    2       lsf  sleep   root   R   0:13      4 linux[01-04]
+    3       lsf  sleep   root   R   0:09      3 linux[01-03]
+    4       lsf  sleep   root   R   0:05      1 linux04
 
 # sinfo -lNe
 NODELIST     NODES PARTITION       STATE CPUS MEMORY TMP_DISK WEIGHT FEATURES REASON
-linux[01-03]    3      lsf*   allocated    2   2981        1      1   (null) none
-linux04         1      lsf*   allocated    4   3813        1      1   (null) none
+linux[01-03]     3      lsf*   allocated    2   2981        1      1   (null) none
+linux04          1      lsf*   allocated    4   3813        1      1   (null) none
 </pre>
 
 <p>Once Job 2 finishes, Job 5, which was pending, is allocated available resources and is then
@@ -443,10 +441,10 @@ running as illustrated below:</p>
 
 <pre>
 # squeue
-  JOBID PARTITION     NAME     USER  ST       TIME  NODES NODELIST(REASON)
-      3       lsf    sleep     root   R       1:58      3 linux[01-03]
-      4       lsf    sleep     root   R       1:54      1 linux04
-      5       lsf    sleep     root   R       0:02      3 linux[01-03]
+JOBID PARTITION   NAME   USER  ST   TIME  NODES NODELIST(REASON)
+   3       lsf   sleep   root   R   1:58      3 linux[01-03]
+   4       lsf   sleep   root   R   1:54      1 linux04
+   5       lsf   sleep   root   R   0:02      3 linux[01-03]
 # sinfo -lNe
 NODELIST     NODES PARTITION       STATE CPUS MEMORY TMP_DISK WEIGHT FEATURES REASON
 linux[01-03]     3      lsf*   allocated    2   2981        1      1   (null) none
@@ -457,8 +455,8 @@ linux04          1      lsf*        idle    4   3813        1      1   (null) no
 
 <pre>
 # squeue
-  JOBID PARTITION     NAME     USER  ST       TIME  NODES NODELIST(REASON)
-      5       lsf    sleep     root   R       1:52      3 linux[01-03]
+JOBID PARTITION   NAME   USER  ST   TIME  NODES NODELIST(REASON)
+    5       lsf  sleep   root   R   1:52      3 linux[01-03]
 </pre>
 
 <p>Job 3 and Job 4 have finshed and Job 5 is still running on nodes linux[01-03].</p>
@@ -480,6 +478,6 @@ one mpi process per node.</p>
 
 <p class="footer"><a href="#top">top</a></p>
 
-<p style="text-align:center;">Last modified 25 September 2006</p>
+<p style="text-align:center;">Last modified 8 July 2008</p>
 
 <!--#include virtual="footer.txt"-->
diff --git a/doc/html/cons_res_share.shtml b/doc/html/cons_res_share.shtml
index 66715e41d32..b007fdf70ab 100644
--- a/doc/html/cons_res_share.shtml
+++ b/doc/html/cons_res_share.shtml
@@ -173,7 +173,9 @@ to begin running "on top of" the existing jobs. This happens with the
 
 <H3>Memory Management</H3>
 <P>
-The management of memory as a consumable resource remains unchanged:
+The management of memory as a consumable resource remains unchanged and
+can be used to prevent oversubscription of memory, which would result in
+having memory pages swapped out and severely degraded performance.
 </P>
 <TABLE CELLPADDING=3 CELLSPACING=1 BORDER=1>
 <TR><TH>Selection Setting</TH>
@@ -202,21 +204,26 @@ available memory to meet the job's memory requirement will not be allocated to
 the job.</TD>
 </TR>
 </TABLE>
-<P>Note that the <CODE>srun --mem=&lt;num&gt;</CODE> option is only used to
-request nodes that have &lt;num&gt; amount of real memory. This option does not
-compute memory that is currently available.
-</P><P>
-The <CODE>srun --job-mem=&lt;num&gt;</CODE> option is used with the
-<CODE>select/cons_res</CODE> plugin to request available memory from each node.
-</P><P>
-The <CODE>select/cons_res</CODE> plugin tracks memory usage by each job on each
-node regardless of the number partitions a node may be assigned to. The primary
-purpose of tracking memory as a consumable resource is to protect jobs from
-having their memory pages swapped out because the memory has been overcommitted.
-</P>
+<P>Users can specify their job's memory requirements in one of two ways.
+<CODE>--mem=&lt;num&gt;</CODE> can be used to specify the job's memory 
+requirement on a per allocated node basis. This option is probably best 
+suited for use with the <CODE>select/linear</CODE> plugin, which allocates 
+whole nodes to jobs. 
+<CODE>--mem-per-cpu=&lt;num&gt;</CODE> can be used to specify the job's 
+memory requirement on a per allocated CPU basis. This is probably best
+suited for use with the <CODE>select/cons_res</CODE> plugin which can 
+allocate individual CPUs to jobs.</P>
+
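+<P>For example, a 16-task job could request its memory either per node or per
+allocated CPU (the application name and memory values, in megabytes, are
+illustrative only):</P>
+<PRE>
+srun -N4 -n16 --mem=2048 ./my_app        <-- 2048 MB per allocated node
+srun -n16 --mem-per-cpu=512 ./my_app     <-- 512 MB per allocated CPU
+</PRE>
+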
+<P>Default and maximum values for memory on a per node or per CPU basis can 
+be configured using the following options: <CODE>DefMemPerCPU</CODE>,
+<CODE>DefMemPerNode</CODE>, <CODE>MaxMemPerCPU</CODE> and <CODE>MaxMemPerNode</CODE>.
+Enforcement of a job's memory allocation is performed by the accounting 
+plugin, which periodically gathers data about running jobs. Set 
+<CODE>JobAcctGatherType</CODE> and <CODE>JobAcctGatherFrequency</CODE> to 
+values suitable for your system.</P>
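+
+<P>For example, a minimal <I>slurm.conf</I> excerpt combining these settings
+might look like the following (the plugin choices and limits shown are
+illustrative only):</P>
+<PRE>
+SelectType=select/cons_res
+SelectTypeParameters=CR_CPU_Memory
+# default and maximum memory per allocated CPU, in megabytes
+DefMemPerCPU=512
+MaxMemPerCPU=2048
+# gather accounting data so memory limits can be enforced
+JobAcctGatherType=jobacct_gather/linux
+JobAcctGatherFrequency=30
+</PRE>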
 
 <p class="footer"><a href="#top">top</a></p>
 
-<p style="text-align:center;">Last modified 27 May 2008</p>
+<p style="text-align:center;">Last modified 8 July 2008</p>
 
 <!--#include virtual="footer.txt"-->
diff --git a/doc/html/gang_scheduling.shtml b/doc/html/gang_scheduling.shtml
index 66c0b7cf690..b249cb6eb7f 100644
--- a/doc/html/gang_scheduling.shtml
+++ b/doc/html/gang_scheduling.shtml
@@ -8,29 +8,30 @@ to jobs.
 Beginning in SLURM version 1.3, gang scheduling is supported. 
 Gang scheduling is when two or more jobs are allocated to the same resources 
 and these jobs are alternately suspended to let all of the tasks of each 
-job have full access to the shared resources for a period of time.
+job have full access to the shared resources for a period of time.
 </P>
-<P>
+<P>
 A resource manager that supports timeslicing can improve it's responsiveness
 and utilization by allowing more jobs to begin running sooner. Shorter-running 
 jobs no longer have to wait in a queue behind longer-running jobs. Instead they 
 can be run "in parallel" with the longer-running jobs, which will allow them 
 to finish quicker. Throughput is also improved because overcommitting the 
 resources provides opportunities for "local backfilling" to occur (see example 
-below).
+below).
 </P>
-<P>
+<P>
 The SLURM 1.3.0 the <I>sched/gang</I> plugin provides timeslicing. When enabled, 
-it monitors each of the partitions in SLURM. If a new job has been allocated to
-resources in a partition that have already been allocated to an existing job,
-then the plugin will suspend the new job until the configured
-<I>SchedulerTimeslice</I> interval has elapsed. Then it will suspend the
-running job and let the new job make use of the resources for a 
-<I>SchedulerTimeslice</I> interval. This will continue until one of the
-jobs terminates.
+it monitors each of the partitions in SLURM. If a new job has been allocated to
+resources in a partition that have already been allocated to an existing job,
+then the plugin will suspend the new job until the configured
+<I>SchedulerTimeslice</I> interval has elapsed. Then it will suspend the
+running job and let the new job make use of the resources for a 
+<I>SchedulerTimeslice</I> interval. This will continue until one of the
+jobs terminates.
 </P>
 
 <H2>Configuration</H2>
+
 <P>
 There are several important configuration parameters relating to 
 gang scheduling:
@@ -46,15 +47,18 @@ allocated by the <I>select/cons_res</I> plugin.
 with jobs, the resource selection plugin should be configured to track the 
 amount of memory used by each job to ensure that memory page swapping does 
 not occur. When <I>select/linear</I> is chosen, we recommend setting 
-<I>SelectTypeParameter=CR_Memory</I>. When <I>select/cons_res</I> is
-chosen, we recommend including Memory as a resource (ex.
+<I>SelectTypeParameter=CR_Memory</I>. When <I>select/cons_res</I> is
+chosen, we recommend including Memory as a resource (ex.
 <I>SelectTypeParameter=CR_Core_Memory</I>).
 </LI>
 <LI>
-<B>DefMemPerTask</B>: Since job requests may not explicitly specify 
-a memory requirement, we also recommend configuring <I>DefMemPerTask</I> 
-(default memory per task). It may also be desirable to configure 
-<I>MaxMemPerTask</I> (maximum memory per task) in <I>slurm.conf</I>.
+<B>DefMemPerCPU</B>: Since job requests may not explicitly specify 
+a memory requirement, we also recommend configuring
+<I>DefMemPerCPU</I> (default memory per allocated CPU) or
+<I>DefMemPerNode</I> (default memory per allocated node).
+It may also be desirable to configure
+<I>MaxMemPerCPU</I> (maximum memory per allocated CPU) or
+<I>MaxMemPerNode</I> (maximum memory per allocated node) in <I>slurm.conf</I>.
 </LI>
 <LI>
 <B>JobAcctGatherType and JobAcctGatherFrequency</B>:
@@ -63,9 +67,9 @@ using the <I>JobAcctGatherType</I> and <I>JobAcctGatherFrequency</I>
 parameters. If accounting is enabled and a job exceeds its configured
 memory limits, it will be canceled in order to prevent it from 
 adversely effecting other jobs sharing the same resources.
-</LI>
+</LI>
 <LI>
-<B>SchedulerType</B>: Configure the <I>sched/gang</I> plugin by setting
+<B>SchedulerType</B>: Configure the <I>sched/gang</I> plugin by setting
 <I>SchedulerType=sched/gang</I> in <I>slurm.conf</I>.
 </LI>
 <LI>
@@ -88,7 +92,7 @@ allocated to a common resource, set <I>Shared=FORCE:6</I>.
 In order to enable gang scheduling after making the configuration changes 
 described above, restart SLURM if it is already running. Any change to the 
 plugin settings in SLURM requires a full restart of the daemons. If you 
-just change the partition <I>Shared</I> setting, this can be updated with
+just change the partition <I>Shared</I> setting, this can be updated with
 <I>scontrol reconfig</I>.
 </P>
 <P>
@@ -96,377 +100,412 @@ For an advanced topic discussion on the potential use of swap space,
 see "Making use of swap space" in the "Future Work" section below.
 </P>
 
-<H2>Timeslicer Design and Operation</H2>
+<H2>Timeslicer Design and Operation</H2>
 
 <P>
-When enabled, the <I>sched/gang</I> plugin keeps track of the resources
-allocated to all jobs. For each partition an "active bitmap" is maintained that
-tracks all concurrently running jobs in the SLURM cluster. Each time a new
-job is allocated to resources in a partition, the <I>sched/gang</I> plugin
-compares these newly allocated resources with the resources already maintained
-in the "active bitmap". If these two sets of resources are disjoint then the new
-job is added to the "active bitmap". If these two sets of resources overlap then
-the new job is suspended. All jobs are tracked in a per-partition job queue
+When enabled, the <I>sched/gang</I> plugin keeps track of the resources
+allocated to all jobs. For each partition an "active bitmap" is maintained that
+tracks all concurrently running jobs in the SLURM cluster. Each time a new
+job is allocated to resources in a partition, the <I>sched/gang</I> plugin
+compares these newly allocated resources with the resources already maintained
+in the "active bitmap". If these two sets of resources are disjoint then the new
+job is added to the "active bitmap". If these two sets of resources overlap then
+the new job is suspended. All jobs are tracked in a per-partition job queue
 within the <I>sched/gang</I> plugin.
 </P>
 <P>
-A separate <I>timeslicer thread</I> is spawned by the <I>sched/gang</I> plugin
-on startup. This thread sleeps for the configured <I>SchedulerTimeSlice</I>
-interval. When it wakes up, it checks each partition for suspended jobs. If
-suspended jobs are found then the <I>timeslicer thread</I> moves all running
-jobs to the end of the job queue. It then reconstructs the "active bitmap" for
-this partition beginning with the suspended job that has waited the longest to
-run (this will be the first suspended job in the run queue). Each following job
-is then compared with the new "active bitmap", and if the job can be run
-concurrently with the other "active" jobs then the job is added. Once this is
-complete then the <I>timeslicer thread</I> suspends any currently running jobs
-that are no longer part of the "active bitmap", and resumes jobs that are new to
+A separate <I>timeslicer thread</I> is spawned by the <I>sched/gang</I> plugin
+on startup. This thread sleeps for the configured <I>SchedulerTimeSlice</I>
+interval. When it wakes up, it checks each partition for suspended jobs. If
+suspended jobs are found then the <I>timeslicer thread</I> moves all running
+jobs to the end of the job queue. It then reconstructs the "active bitmap" for
+this partition beginning with the suspended job that has waited the longest to
+run (this will be the first suspended job in the run queue). Each following job
+is then compared with the new "active bitmap", and if the job can be run
+concurrently with the other "active" jobs then the job is added. Once this is
+complete then the <I>timeslicer thread</I> suspends any currently running jobs
+that are no longer part of the "active bitmap", and resumes jobs that are new to
 the "active bitmap".
 </P>
 <P>
-This <I>timeslicer thread</I> algorithm for rotating jobs is designed to prevent
-jobs from starving (remaining in the suspended state indefinitly) and to be as
-fair as possible in the distribution of runtime while still keeping all of the
+This <I>timeslicer thread</I> algorithm for rotating jobs is designed to prevent
+jobs from starving (remaining in the suspended state indefinitely) and to be as
+fair as possible in the distribution of runtime while still keeping all of the
 resources as busy as possible.
 </P>
 <P>
-The <I>sched/gang</I> plugin suspends jobs via the same internal functions that
-support <I>scontrol suspend</I> and <I>scontrol resume</I>. A good way to
-observe the operation of the timeslicer is by running <I>watch squeue</I> in a
+The <I>sched/gang</I> plugin suspends jobs via the same internal functions that
+support <I>scontrol suspend</I> and <I>scontrol resume</I>. A good way to
+observe the operation of the timeslicer is by running <I>watch squeue</I> in a
 terminal window.
 </P>
 
-<H2>A Simple Example</H2>
+<H2>A Simple Example</H2>
 
 <P>
-The following example is configured with <I>select/linear</I>,
-<I>sched/gang</I>, and <I>Shared=FORCE</I>. This example takes place on a small
+The following example is configured with <I>select/linear</I>,
+<I>sched/gang</I>, and <I>Shared=FORCE</I>. This example takes place on a small
 cluster of 5 nodes:
-</P>
-<PRE>
-[user@n16 load]$ <B>sinfo</B>
-PARTITION AVAIL  TIMELIMIT NODES  STATE NODELIST
-active*      up   infinite     5   idle n[12-16]
+</P>
+<PRE>
+[user@n16 load]$ <B>sinfo</B>
+PARTITION AVAIL  TIMELIMIT NODES  STATE NODELIST
+active*      up   infinite     5   idle n[12-16]
 </PRE>
 <P>
 Here are the Scheduler settings (the last two settings are the relevant ones):
 </P>
-<PRE>
-[user@n16 load]$ <B>scontrol show config | grep Sched</B>
-FastSchedule            = 1
-SchedulerPort           = 7321
-SchedulerRootFilter     = 1
-SchedulerTimeSlice      = 30
-SchedulerType           = sched/gang
-[user@n16 load]$
-</PRE>
-<P>
-The <I>myload</I> script launches a simple load-generating app that runs
+<PRE>
+[user@n16 load]$ <B>scontrol show config | grep Sched</B>
+FastSchedule            = 1
+SchedulerPort           = 7321
+SchedulerRootFilter     = 1
+SchedulerTimeSlice      = 30
+SchedulerType           = sched/gang
+</PRE>
+<P>
+The <I>myload</I> script launches a simple load-generating app that runs
 for the given number of seconds. Submit <I>myload</I> to run on all nodes:
 </P>
-<PRE>
-[user@n16 load]$ <B>sbatch -N5 ./myload 300</B>
-sbatch: Submitted batch job 3
-[user@n16 load]$ <B>squeue</B>
-JOBID PARTITION    NAME  USER ST  TIME NODES NODELIST
-    3    active  myload  user     0:05     5 n[12-16]
+<PRE>
+[user@n16 load]$ <B>sbatch -N5 ./myload 300</B>
+sbatch: Submitted batch job 3
+
+[user@n16 load]$ <B>squeue</B>
+JOBID PARTITION    NAME  USER ST  TIME NODES NODELIST
+    3    active  myload  user     0:05     5 n[12-16]
 </PRE>
 <P>
 Submit it again and watch the <I>sched/gang</I> plugin suspend it:
 </P>
-<PRE>
-[user@n16 load]$ <B>sbatch -N5 ./myload 300</B>
-sbatch: Submitted batch job 4
-[user@n16 load]$ <B>squeue</B>
-JOBID PARTITION    NAME  USER ST  TIME NODES NODELIST
-    3    active  myload  user  R  0:13     5 n[12-16]
-    4    active  myload  user  S  0:00     5 n[12-16]
+<PRE>
+[user@n16 load]$ <B>sbatch -N5 ./myload 300</B>
+sbatch: Submitted batch job 4
+
+[user@n16 load]$ <B>squeue</B>
+JOBID PARTITION    NAME  USER ST  TIME NODES NODELIST
+    3    active  myload  user  R  0:13     5 n[12-16]
+    4    active  myload  user  S  0:00     5 n[12-16]
 </PRE>
 <P>
-After 30 seconds the <I>sched/gang</I> plugin swaps jobs, and now job 4 is the
+After 30 seconds the <I>sched/gang</I> plugin swaps jobs, and now job 4 is the
 active one:
 </P>
-<PRE>
-[user@n16 load]$ <B>squeue</B>
-JOBID PARTITION    NAME  USER ST  TIME NODES NODELIST
-    4    active  myload  user  R  0:08     5 n[12-16]
-    3    active  myload  user  S  0:41     5 n[12-16]
-[user@n16 load]$ <B>squeue</B>
-JOBID PARTITION    NAME  USER ST  TIME NODES NODELIST
-    4    active  myload  user  R  0:21     5 n[12-16]
-    3    active  myload  user  S  0:41     5 n[12-16]
+<PRE>
+[user@n16 load]$ <B>squeue</B>
+JOBID PARTITION    NAME  USER ST  TIME NODES NODELIST
+    4    active  myload  user  R  0:08     5 n[12-16]
+    3    active  myload  user  S  0:41     5 n[12-16]
+
+[user@n16 load]$ <B>squeue</B>
+JOBID PARTITION    NAME  USER ST  TIME NODES NODELIST
+    4    active  myload  user  R  0:21     5 n[12-16]
+    3    active  myload  user  S  0:41     5 n[12-16]
+</PRE>
+<P>
+After another 30 seconds the <I>sched/gang</I> plugin sets job 3 running again:
+</P>
+<PRE>
+[user@n16 load]$ <B>squeue</B>
+JOBID PARTITION    NAME  USER ST  TIME NODES NODELIST
+    3    active  myload  user  R  0:50     5 n[12-16]
+    4    active  myload  user  S  0:30     5 n[12-16]
+</PRE>
+
+<P>
+<B>A possible side effect of timeslicing</B>: Note that jobs that are
+immediately suspended may cause their srun commands to produce the following
+output:
+</P>
+<PRE>
+[user@n16 load]$ <B>cat slurm-4.out</B>
+srun: Job step creation temporarily disabled, retrying
+srun: Job step creation still disabled, retrying
+srun: Job step creation still disabled, retrying
+srun: Job step creation still disabled, retrying
+srun: Job step created
+</PRE>
+<P>
+This occurs because <I>srun</I> is attempting to launch a jobstep in an
+allocation that has been suspended. The <I>srun</I> process will continue in a
+retry loop to launch the jobstep until the allocation has been resumed and the
+jobstep can be launched.
+</P>
+<P>
+When the <I>sched/gang</I> plugin is enabled, this type of output in the user
+jobs should be considered benign.
+</P>
+
+<H2>More examples</H2>
+
+<P>
+The following example shows how the timeslicer algorithm keeps the resources
+busy. Job 10 runs continually, while jobs 9 and 11 are timesliced:
+</P>
+
+<PRE>
+[user@n16 load]$ <B>sbatch -N3 ./myload 300</B>
+sbatch: Submitted batch job 9
+
+[user@n16 load]$ <B>sbatch -N2 ./myload 300</B>
+sbatch: Submitted batch job 10
+
+[user@n16 load]$ <B>sbatch -N3 ./myload 300</B>
+sbatch: Submitted batch job 11
+
+[user@n16 load]$ <B>squeue</B>
+JOBID PARTITION    NAME  USER ST  TIME NODES NODELIST
+    9    active  myload  user  R  0:11     3 n[12-14]
+   10    active  myload  user  R  0:08     2 n[15-16]
+   11    active  myload  user  S  0:00     3 n[12-14]
+
+[user@n16 load]$ <B>squeue</B>
+JOBID PARTITION    NAME  USER ST  TIME NODES NODELIST
+   10    active  myload  user  R  0:50     2 n[15-16]
+   11    active  myload  user  R  0:12     3 n[12-14]
+    9    active  myload  user  S  0:41     3 n[12-14]
+
+[user@n16 load]$ <B>squeue</B>
+JOBID PARTITION    NAME  USER ST  TIME NODES NODELIST
+   10    active  myload  user  R  1:04     2 n[15-16]
+   11    active  myload  user  R  0:26     3 n[12-14]
+    9    active  myload  user  S  0:41     3 n[12-14]
+
+[user@n16 load]$ <B>squeue</B>
+JOBID PARTITION    NAME  USER ST  TIME NODES NODELIST
+    9    active  myload  user  R  0:46     3 n[12-14]
+   10    active  myload  user  R  1:13     2 n[15-16]
+   11    active  myload  user  S  0:30     3 n[12-14]
 </PRE>
-<P>
After another 30 seconds the <I>sched/gang</I> plugin sets job 3 running again:
-</P>
-<PRE>
-[user@n16 load]$ <B>squeue</B>
-JOBID PARTITION    NAME  USER ST  TIME NODES NODELIST
-    3    active  myload  user  R  0:50     5 n[12-16]
-    4    active  myload  user  S  0:30     5 n[12-16]
-</PRE>
-<P>
-<B>A possible side effect of timeslicing</B>: Note that jobs that are
-immediately suspended may cause their srun commands to produce the following
-output:
-</P>
-<PRE>
-[user@n16 load]$ <B>cat slurm-4.out</B>
-srun: Job step creation temporarily disabled, retrying
-srun: Job step creation still disabled, retrying
-srun: Job step creation still disabled, retrying
-srun: Job step creation still disabled, retrying
-srun: Job step created
+</P>
+<P>
+The next example displays "local backfilling":
+</P>
+<PRE>
+[user@n16 load]$ <B>sbatch -N3 ./myload 300</B>
+sbatch: Submitted batch job 12
+
+[user@n16 load]$ <B>sbatch -N5 ./myload 300</B>
+sbatch: Submitted batch job 13
+
+[user@n16 load]$ <B>sbatch -N2 ./myload 300</B>
+sbatch: Submitted batch job 14
+
+[user@n16 load]$ <B>squeue</B>
+JOBID PARTITION    NAME  USER ST  TIME NODES NODELIST
+   12    active  myload  user  R  0:14     3 n[12-14]
+   14    active  myload  user  R  0:06     2 n[15-16]
+   13    active  myload  user  S  0:00     5 n[12-16]
 </PRE>
-<P>
-This occurs because <I>srun</I> is attempting to launch a jobstep in an
-allocation that has been suspended. The <I>srun</I> process will continue in a
-retry loop to launch the jobstep until the allocation has been resumed and the
-jobstep can be launched.
-</P>
-<P>
-When the <I>sched/gang</I> plugin is enabled, this type of output in the user
-jobs should be considered benign.
-</P>
-
-<H2>More examples</H2>
-<P>
-The following example shows how the timeslicer algorithm keeps the resources
-busy. Job 10 runs continually, while jobs 9 and 11 are timesliced:
-</P>
-<PRE>
-[user@n16 load]$ <B>sbatch -N3 ./myload 300</B>
-sbatch: Submitted batch job 9
-[user@n16 load]$ <B>sbatch -N2 ./myload 300</B>
-sbatch: Submitted batch job 10
-[user@n16 load]$ <B>sbatch -N3 ./myload 300</B>
-sbatch: Submitted batch job 11
-[user@n16 load]$ <B>squeue</B>
-JOBID PARTITION    NAME  USER ST  TIME NODES NODELIST
-    9    active  myload  user  R  0:11     3 n[12-14]
-   10    active  myload  user  R  0:08     2 n[15-16]
-   11    active  myload  user  S  0:00     3 n[12-14]
-[user@n16 load]$ <B>squeue</B>
-JOBID PARTITION    NAME  USER ST  TIME NODES NODELIST
-   10    active  myload  user  R  0:50     2 n[15-16]
-   11    active  myload  user  R  0:12     3 n[12-14]
-    9    active  myload  user  S  0:41     3 n[12-14]
-[user@n16 load]$ <B>squeue</B>
-JOBID PARTITION    NAME  USER ST  TIME NODES NODELIST
-   10    active  myload  user  R  1:04     2 n[15-16]
-   11    active  myload  user  R  0:26     3 n[12-14]
-    9    active  myload  user  S  0:41     3 n[12-14]
-[user@n16 load]$ <B>squeue</B>
-JOBID PARTITION    NAME  USER ST  TIME NODES NODELIST
-    9    active  myload  user  R  0:46     3 n[12-14]
-   10    active  myload  user  R  1:13     2 n[15-16]
-   11    active  myload  user  S  0:30     3 n[12-14]
-[user@n16 load]$
-</PRE>
-</P>
-<P>
-The next example displays "local backfilling":
-</P>
-<PRE>
-[user@n16 load]$ <B>sbatch -N3 ./myload 300</B>
-sbatch: Submitted batch job 12
-[user@n16 load]$ <B>sbatch -N5 ./myload 300</B>
-sbatch: Submitted batch job 13
-[user@n16 load]$ <B>sbatch -N2 ./myload 300</B>
-sbatch: Submitted batch job 14
-[user@n16 load]$ <B>squeue</B>
-JOBID PARTITION    NAME  USER ST  TIME NODES NODELIST
-   12    active  myload  user  R  0:14     3 n[12-14]
-   14    active  myload  user  R  0:06     2 n[15-16]
-   13    active  myload  user  S  0:00     5 n[12-16]
-[user@n16 load]$
-</PRE>
-<P>
-Without timeslicing and without the backfill scheduler enabled, job 14 has to
-wait for job 13 to finish.
-</P><P>
-This is called "local" backfilling because the backfilling only occurs with jobs
-close enough in the queue to get allocated by the scheduler as part of
-oversubscribing the resources. Recall that the number of jobs that can
-overcommit a resource is controlled by the <I>Shared=FORCE:max_share</I> value,
-so this value effectively controls the scope of "local backfilling".
-</P><P>
-Normal backfill algorithms check <U>all</U> jobs in the wait queue.
-</P>
-
-<H2>Consumable Resource Examples</H2>
-<P>
-The following two examples illustrate the primary difference between
-<I>CR_CPU</I> and <I>CR_Core</I> when consumable resource selection is enabled
-(<I>select/cons_res</I>).
-</P>
-<P>
-When <I>CR_CPU</I> (or <I>CR_CPU_Memory</I>) is configured then the selector
-treats the CPUs as simple, <I>interchangeable</I> computing resources. However
-when <I>CR_Core</I> (or <I>CR_Core_Memory</I>) is enabled the selector treats
-the CPUs as individual resources that are <U>specifically</U> allocated to jobs.
-This subtle difference is highlighted when timeslicing is enabled.
-</P>
-<P>
-In both examples 6 jobs are submitted. Each job requests 2 CPUs per node, and
-all of the nodes contain two quad-core processors. The timeslicer will initially
-let the first 4 jobs run and suspend the last 2 jobs. The manner in which these
-jobs are timesliced depends upon the configured <I>SelectTypeParameter</I>.
-</P>
-<P>
-In the first example <I>CR_Core_Memory</I> is configured. Note that jobs 46 and
-47 don't <U>ever</U> get suspended. This is because they are not sharing their
-cores with any other job. Jobs 48 and 49 were allocated to the same cores as
-jobs 45 and 46. The timeslicer recognizes this and timeslices only those jobs: 
-</P>
-<PRE>
-[user@n16 load]$ <B>sinfo</B>
-PARTITION AVAIL  TIMELIMIT NODES  STATE NODELIST
-active*      up   infinite     5   idle n[12-16]
-[user@n16 load]$ <B>scontrol show config | grep Select</B>
-SelectType              = select/cons_res
-SelectTypeParameters    = CR_CORE_MEMORY
-[user@n16 load]$ <B>sinfo -o "%20N %5D %5c %5z"</B>
-NODELIST             NODES CPUS  S:C:T
-n[12-16]             5     8     2:4:1
-[user@n16 load]$
-[user@n16 load]$
-[user@n16 load]$ <B>sbatch -n10 -N5 ./myload 300</B>
-sbatch: Submitted batch job 44
-[user@n16 load]$ <B>sbatch -n10 -N5 ./myload 300</B>
-sbatch: Submitted batch job 45
-[user@n16 load]$ <B>sbatch -n10 -N5 ./myload 300</B>
-sbatch: Submitted batch job 46
-[user@n16 load]$ <B>sbatch -n10 -N5 ./myload 300</B>
-sbatch: Submitted batch job 47
-[user@n16 load]$ <B>sbatch -n10 -N5 ./myload 300</B>
-sbatch: Submitted batch job 48
-[user@n16 load]$ <B>sbatch -n10 -N5 ./myload 300</B>
-sbatch: Submitted batch job 49
-[user@n16 load]$ <B>squeue</B>
-JOBID PARTITION    NAME  USER ST  TIME NODES NODELIST
-   44    active  myload  user  R  0:09     5 n[12-16]
-   45    active  myload  user  R  0:08     5 n[12-16]
-   46    active  myload  user  R  0:08     5 n[12-16]
-   47    active  myload  user  R  0:07     5 n[12-16]
-   48    active  myload  user  S  0:00     5 n[12-16]
-   49    active  myload  user  S  0:00     5 n[12-16]
-[user@n16 load]$ <B>squeue</B>
-JOBID PARTITION    NAME  USER ST  TIME NODES NODELIST
-   46    active  myload  user  R  0:49     5 n[12-16]
-   47    active  myload  user  R  0:48     5 n[12-16]
-   48    active  myload  user  R  0:06     5 n[12-16]
-   49    active  myload  user  R  0:06     5 n[12-16]
-   44    active  myload  user  S  0:44     5 n[12-16]
-   45    active  myload  user  S  0:43     5 n[12-16]
-[user@n16 load]$ <B>squeue</B>
-JOBID PARTITION    NAME  USER ST  TIME NODES NODELIST
-   44    active  myload  user  R  1:23     5 n[12-16]
-   45    active  myload  user  R  1:22     5 n[12-16]
-   46    active  myload  user  R  2:22     5 n[12-16]
-   47    active  myload  user  R  2:21     5 n[12-16]
-   48    active  myload  user  S  1:00     5 n[12-16]
-   49    active  myload  user  S  1:00     5 n[12-16]
-[user@n16 load]$
-</PRE>
-<P>
-Note the runtime of all 6 jobs in the output of the last <I>squeue</I> command.
-Jobs 46 and 47 have been running continuously, while jobs 45 and 46 are
-splitting their runtime with jobs 48 and 49.
-</P><P>
-The next example has <I>CR_CPU_Memory</I> configured and the same 6 jobs are
-submitted. Here the selector and the timeslicer treat the CPUs as countable
-resources which results in all 6 jobs sharing time on the CPUs:
-</P>
-<PRE>
-[user@n16 load]$ <B>sinfo</B>
-PARTITION AVAIL  TIMELIMIT NODES  STATE NODELIST
-active*      up   infinite     5   idle n[12-16]
-[user@n16 load]$ <B>scontrol show config | grep Select</B>
-SelectType              = select/cons_res
-SelectTypeParameters    = CR_CPU_MEMORY
-[user@n16 load]$ <B>sinfo -o "%20N %5D %5c %5z"</B>
-NODELIST             NODES CPUS  S:C:T
-n[12-16]             5     8     2:4:1
-[user@n16 load]$
-[user@n16 load]$
-[user@n16 load]$ <B>sbatch -n10 -N5 ./myload 300</B>
-sbatch: Submitted batch job 51
-[user@n16 load]$ <B>sbatch -n10 -N5 ./myload 300</B>
-sbatch: Submitted batch job 52
-[user@n16 load]$ <B>sbatch -n10 -N5 ./myload 300</B>
-sbatch: Submitted batch job 53
-[user@n16 load]$ <B>sbatch -n10 -N5 ./myload 300</B>
-sbatch: Submitted batch job 54
-[user@n16 load]$ <B>sbatch -n10 -N5 ./myload 300</B>
-sbatch: Submitted batch job 55
-[user@n16 load]$ <B>sbatch -n10 -N5 ./myload 300</B>
-sbatch: Submitted batch job 56
-[user@n16 load]$ <B>squeue</B>
-JOBID PARTITION    NAME  USER ST  TIME NODES NODELIST
-   51    active  myload  user  R  0:11     5 n[12-16]
-   52    active  myload  user  R  0:11     5 n[12-16]
-   53    active  myload  user  R  0:10     5 n[12-16]
-   54    active  myload  user  R  0:09     5 n[12-16]
-   55    active  myload  user  S  0:00     5 n[12-16]
-   56    active  myload  user  S  0:00     5 n[12-16]
-[user@n16 load]$ <B>squeue</B>
-JOBID PARTITION    NAME  USER ST  TIME NODES NODELIST
-   51    active  myload  user  R  1:09     5 n[12-16]
-   52    active  myload  user  R  1:09     5 n[12-16]
-   55    active  myload  user  R  0:23     5 n[12-16]
-   56    active  myload  user  R  0:23     5 n[12-16]
-   53    active  myload  user  S  0:45     5 n[12-16]
-   54    active  myload  user  S  0:44     5 n[12-16]
-[user@n16 load]$ <B>squeue</B>
-JOBID PARTITION    NAME  USER ST  TIME NODES NODELIST
-   53    active  myload  user  R  0:55     5 n[12-16]
-   54    active  myload  user  R  0:54     5 n[12-16]
-   55    active  myload  user  R  0:40     5 n[12-16]
-   56    active  myload  user  R  0:40     5 n[12-16]
-   51    active  myload  user  S  1:16     5 n[12-16]
-   52    active  myload  user  S  1:16     5 n[12-16]
-[user@n16 load]$ <B>squeue</B>
-JOBID PARTITION    NAME  USER ST  TIME NODES NODELIST
-   51    active  myload  user  R  3:18     5 n[12-16]
-   52    active  myload  user  R  3:18     5 n[12-16]
-   53    active  myload  user  R  3:17     5 n[12-16]
-   54    active  myload  user  R  3:16     5 n[12-16]
-   55    active  myload  user  S  3:00     5 n[12-16]
-   56    active  myload  user  S  3:00     5 n[12-16]
-[user@n16 load]$
-</PRE>
-<P>
-Note that the runtime of all 6 jobs is roughly equal. Jobs 51-54 ran first so
-they're slightly ahead, but so far all jobs have run for at least 3 minutes.
-</P><P>
-At the core level this means that SLURM relies on the linux kernel to move jobs
-around on the cores to maximize performance. This is different than when
-<I>CR_Core_Memory</I> was configured and the jobs would effectively remain
-"pinned" to their specific cores for the duration of the job. Note that
-<I>CR_Core_Memory</I> supports CPU binding, while <I>CR_CPU_Memory</I> does not.
+<P>
+Without timeslicing and without the backfill scheduler enabled, job 14 has to
+wait for job 13 to finish.
+</P>
+<P>
+This is called "local" backfilling because the backfilling only occurs with jobs
+close enough in the queue to get allocated by the scheduler as part of
+oversubscribing the resources. Recall that the number of jobs that can
+overcommit a resource is controlled by the <I>Shared=FORCE:max_share</I> value,
+so this value effectively controls the scope of "local backfilling".
+</P>
+<P>
+Normal backfill algorithms check <U>all</U> jobs in the wait queue.
+</P>
+
+<H2>Consumable Resource Examples</H2>
+
+<P>
+The following two examples illustrate the primary difference between
+<I>CR_CPU</I> and <I>CR_Core</I> when consumable resource selection is enabled
+(<I>select/cons_res</I>).
+</P>
+<P>
+When <I>CR_CPU</I> (or <I>CR_CPU_Memory</I>) is configured then the selector
+treats the CPUs as simple, <I>interchangeable</I> computing resources. However
+when <I>CR_Core</I> (or <I>CR_Core_Memory</I>) is enabled the selector treats
+the CPUs as individual resources that are <U>specifically</U> allocated to jobs.
+This subtle difference is highlighted when timeslicing is enabled.
+</P>
+<P>
+In both examples 6 jobs are submitted. Each job requests 2 CPUs per node, and
+all of the nodes contain two quad-core processors. The timeslicer will initially
+let the first 4 jobs run and suspend the last 2 jobs. The manner in which these
+jobs are timesliced depends upon the configured <I>SelectTypeParameter</I>.
+</P>
+<P>
+In the first example <I>CR_Core_Memory</I> is configured. Note that jobs 46 and
+47 don't <U>ever</U> get suspended. This is because they are not sharing their
+cores with any other job. Jobs 48 and 49 were allocated to the same cores as
+jobs 44 and 45. The timeslicer recognizes this and timeslices only those jobs: 
+</P>
+<PRE>
+[user@n16 load]$ <B>sinfo</B>
+PARTITION AVAIL  TIMELIMIT NODES  STATE NODELIST
+active*      up   infinite     5   idle n[12-16]
+
+[user@n16 load]$ <B>scontrol show config | grep Select</B>
+SelectType              = select/cons_res
+SelectTypeParameters    = CR_CORE_MEMORY
+
+[user@n16 load]$ <B>sinfo -o "%20N %5D %5c %5z"</B>
+NODELIST             NODES CPUS  S:C:T
+n[12-16]             5     8     2:4:1
+
+[user@n16 load]$ <B>sbatch -n10 -N5 ./myload 300</B>
+sbatch: Submitted batch job 44
+
+[user@n16 load]$ <B>sbatch -n10 -N5 ./myload 300</B>
+sbatch: Submitted batch job 45
+
+[user@n16 load]$ <B>sbatch -n10 -N5 ./myload 300</B>
+sbatch: Submitted batch job 46
+
+[user@n16 load]$ <B>sbatch -n10 -N5 ./myload 300</B>
+sbatch: Submitted batch job 47
+
+[user@n16 load]$ <B>sbatch -n10 -N5 ./myload 300</B>
+sbatch: Submitted batch job 48
+
+[user@n16 load]$ <B>sbatch -n10 -N5 ./myload 300</B>
+sbatch: Submitted batch job 49
+
+[user@n16 load]$ <B>squeue</B>
+JOBID PARTITION    NAME  USER ST  TIME NODES NODELIST
+   44    active  myload  user  R  0:09     5 n[12-16]
+   45    active  myload  user  R  0:08     5 n[12-16]
+   46    active  myload  user  R  0:08     5 n[12-16]
+   47    active  myload  user  R  0:07     5 n[12-16]
+   48    active  myload  user  S  0:00     5 n[12-16]
+   49    active  myload  user  S  0:00     5 n[12-16]
+
+[user@n16 load]$ <B>squeue</B>
+JOBID PARTITION    NAME  USER ST  TIME NODES NODELIST
+   46    active  myload  user  R  0:49     5 n[12-16]
+   47    active  myload  user  R  0:48     5 n[12-16]
+   48    active  myload  user  R  0:06     5 n[12-16]
+   49    active  myload  user  R  0:06     5 n[12-16]
+   44    active  myload  user  S  0:44     5 n[12-16]
+   45    active  myload  user  S  0:43     5 n[12-16]
+
+[user@n16 load]$ <B>squeue</B>
+JOBID PARTITION    NAME  USER ST  TIME NODES NODELIST
+   44    active  myload  user  R  1:23     5 n[12-16]
+   45    active  myload  user  R  1:22     5 n[12-16]
+   46    active  myload  user  R  2:22     5 n[12-16]
+   47    active  myload  user  R  2:21     5 n[12-16]
+   48    active  myload  user  S  1:00     5 n[12-16]
+   49    active  myload  user  S  1:00     5 n[12-16]
+</PRE>
+<P>
+Note the runtime of all 6 jobs in the output of the last <I>squeue</I> command.
+Jobs 46 and 47 have been running continuously, while jobs 44 and 45 are
+splitting their runtime with jobs 48 and 49.
+</P>
+<P>
+The next example has <I>CR_CPU_Memory</I> configured and the same 6 jobs are
+submitted. Here the selector and the timeslicer treat the CPUs as countable
+resources which results in all 6 jobs sharing time on the CPUs:
+</P>
+<PRE>
+[user@n16 load]$ <B>sinfo</B>
+PARTITION AVAIL  TIMELIMIT NODES  STATE NODELIST
+active*      up   infinite     5   idle n[12-16]
+
+[user@n16 load]$ <B>scontrol show config | grep Select</B>
+SelectType              = select/cons_res
+SelectTypeParameters    = CR_CPU_MEMORY
+
+[user@n16 load]$ <B>sinfo -o "%20N %5D %5c %5z"</B>
+NODELIST             NODES CPUS  S:C:T
+n[12-16]             5     8     2:4:1
+
+[user@n16 load]$ <B>sbatch -n10 -N5 ./myload 300</B>
+sbatch: Submitted batch job 51
+
+[user@n16 load]$ <B>sbatch -n10 -N5 ./myload 300</B>
+sbatch: Submitted batch job 52
+
+[user@n16 load]$ <B>sbatch -n10 -N5 ./myload 300</B>
+sbatch: Submitted batch job 53
+
+[user@n16 load]$ <B>sbatch -n10 -N5 ./myload 300</B>
+sbatch: Submitted batch job 54
+
+[user@n16 load]$ <B>sbatch -n10 -N5 ./myload 300</B>
+sbatch: Submitted batch job 55
+
+[user@n16 load]$ <B>sbatch -n10 -N5 ./myload 300</B>
+sbatch: Submitted batch job 56
+
+[user@n16 load]$ <B>squeue</B>
+JOBID PARTITION    NAME  USER ST  TIME NODES NODELIST
+   51    active  myload  user  R  0:11     5 n[12-16]
+   52    active  myload  user  R  0:11     5 n[12-16]
+   53    active  myload  user  R  0:10     5 n[12-16]
+   54    active  myload  user  R  0:09     5 n[12-16]
+   55    active  myload  user  S  0:00     5 n[12-16]
+   56    active  myload  user  S  0:00     5 n[12-16]
+
+[user@n16 load]$ <B>squeue</B>
+JOBID PARTITION    NAME  USER ST  TIME NODES NODELIST
+   51    active  myload  user  R  1:09     5 n[12-16]
+   52    active  myload  user  R  1:09     5 n[12-16]
+   55    active  myload  user  R  0:23     5 n[12-16]
+   56    active  myload  user  R  0:23     5 n[12-16]
+   53    active  myload  user  S  0:45     5 n[12-16]
+   54    active  myload  user  S  0:44     5 n[12-16]
+
+[user@n16 load]$ <B>squeue</B>
+JOBID PARTITION    NAME  USER ST  TIME NODES NODELIST
+   53    active  myload  user  R  0:55     5 n[12-16]
+   54    active  myload  user  R  0:54     5 n[12-16]
+   55    active  myload  user  R  0:40     5 n[12-16]
+   56    active  myload  user  R  0:40     5 n[12-16]
+   51    active  myload  user  S  1:16     5 n[12-16]
+   52    active  myload  user  S  1:16     5 n[12-16]
+
+[user@n16 load]$ <B>squeue</B>
+JOBID PARTITION    NAME  USER ST  TIME NODES NODELIST
+   51    active  myload  user  R  3:18     5 n[12-16]
+   52    active  myload  user  R  3:18     5 n[12-16]
+   53    active  myload  user  R  3:17     5 n[12-16]
+   54    active  myload  user  R  3:16     5 n[12-16]
+   55    active  myload  user  S  3:00     5 n[12-16]
+   56    active  myload  user  S  3:00     5 n[12-16]
+</PRE>
+<P>
+Note that the runtime of all 6 jobs is roughly equal. Jobs 51-54 ran first so
+they're slightly ahead, but so far all jobs have run for at least 3 minutes.
+</P>
+<P>
+At the core level this means that SLURM relies on the Linux kernel to move
+jobs around on the cores to maximize performance. This differs from the
+<I>CR_Core_Memory</I> configuration, where jobs effectively remain "pinned"
+to their specific cores for their duration. Note that
+<I>CR_Core_Memory</I> supports CPU binding, while <I>CR_CPU_Memory</I> does not.
 </P>
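+<P>
+As an illustrative sketch (the 512 MB figure below is arbitrary), a per-CPU
+memory requirement could be added to the submissions shown above using the
+new <I>--mem-per-cpu</I> option:
+</P>
+<PRE>
+[user@n16 load]$ <B>sbatch -n10 -N5 --mem-per-cpu=512 ./myload 300</B>
+</PRE>
+<P>
+With memory configured as a consumable resource, the requested memory is then
+accounted for on each allocated node when the selector places these jobs.
+</P>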
 
 <H2>Future Work</H2>
-
-<P>
-Priority scheduling and preemptive scheduling are other forms of gang
-scheduling that are currently under development for SLURM.
-</P>
-<P>
-<B>Making use of swap space</B>: (note that this topic is not currently
-scheduled for development, unless someone would like to pursue this) It should
-be noted that timeslicing does provide an interesting mechanism for high
-performance jobs to make use of swap space. The optimal scenario is one in which
-suspended jobs are "swapped out" and active jobs are "swapped in". The swapping
-activity would only occur once every  <I>SchedulerTimeslice</I> interval.
-</P>
-<P>
-However, SLURM should first be modified to include support for scheduling jobs
-into swap space and to provide controls to prevent overcommitting swap space.
-For now this idea could be experimented with by disabling memory support in the
-selector and submitting appropriately sized jobs.
-</P>
-
-<p style="text-align:center;">Last modified 17 March 2008</p>
+
+<P>
+Priority scheduling and preemptive scheduling are other forms of gang
+scheduling that are currently under development for SLURM.
+</P>
+<P>
+<B>Making use of swap space</B>: (this topic is not currently scheduled for
+development, unless someone would like to pursue it)
+Timeslicing does provide an interesting mechanism for high performance jobs to
+make use of swap space. The optimal scenario is one in which
+suspended jobs are "swapped out" and active jobs are "swapped in". The swapping
+activity would only occur once every <I>SchedulerTimeslice</I> interval.
+</P>
+<P>
+However, SLURM should first be modified to include support for scheduling jobs
+into swap space and to provide controls to prevent overcommitting swap space.
+For now this idea could be experimented with by disabling memory support in the
+selector and submitting appropriately sized jobs.
+</P>
+
+<p style="text-align:center;">Last modified 7 July 2008</p>
 
 <!--#include virtual="footer.txt"-->
diff --git a/doc/html/preempt.shtml b/doc/html/preempt.shtml
index d58acf00324..f9fd8c0b9db 100644
--- a/doc/html/preempt.shtml
+++ b/doc/html/preempt.shtml
@@ -44,10 +44,13 @@ chosen, we recommend setting <I>SelectTypeParameter=CR_Memory</I>. When
 (ex. <I>SelectTypeParameter=CR_Core_Memory</I>).
 </LI>
 <LI>
-<B>DefMemPerTask</B>: Since job requests may not explicitly specify 
-a memory requirement, we also recommend configuring <I>DefMemPerTask</I> 
-(default memory per task). It may also be desirable to configure 
-<I>MaxMemPerTask</I> (maximum memory per task) in <I>slurm.conf</I>.
+<B>DefMemPerCPU</B>: Since job requests may not explicitly specify 
+a memory requirement, we also recommend configuring 
+<I>DefMemPerCPU</I> (default memory per allocated CPU) or 
+<I>DefMemPerNode</I> (default memory per allocated node). 
+It may also be desirable to configure 
+<I>MaxMemPerCPU</I> (maximum memory per allocated CPU) or 
+<I>MaxMemPerNode</I> (maximum memory per allocated node) in <I>slurm.conf</I>.
 </LI>
 <LI>
 <B>JobAcctGatherType and JobAcctGatherFrequency</B>:
@@ -242,6 +245,6 @@ again. This will be investigated at some point in the future. Requeuing a
 preempted job may make the most sense with <I>Shared=NO</I> partitions.
 </P>
 
-<p style="text-align:center;">Last modified 11 April 2008</p>
+<p style="text-align:center;">Last modified 7 July 2008</p>
 
 <!--#include virtual="footer.txt"-->
diff --git a/doc/man/man1/salloc.1 b/doc/man/man1/salloc.1
index 72a8bcdb59f..356f916d006 100644
--- a/doc/man/man1/salloc.1
+++ b/doc/man/man1/salloc.1
@@ -1,4 +1,4 @@
-.TH "salloc" "1" "SLURM 1.3" "May 2008" "SLURM Commands"
+.TH "salloc" "1" "SLURM 1.3" "July 2008" "SLURM Commands"
 .SH "NAME"
 .LP 
 salloc \- Obtain a SLURM job allocation (a set of nodes), execute a command, and then release the allocation when the command is finished.
@@ -306,12 +306,24 @@ The default value is the username of the submitting user.
 .TP
 \fB\-\-mem\fR[=]<\fIMB\fR>
 Specify the real memory required per node in MegaBytes.
-If a value is specified, that quantity of memory will be 
-reserved for this job. 
-If no value is specified and real memory is exhausted on 
-any allocated node then the job is subject to cancellation.
-Also see \fB\-\-task\-mem\fR.
-
+Default value is \fBDefMemPerNode\fR and the maximum value is
+\fBMaxMemPerNode\fR. If configured, both parameters can be
+seen using the \fBscontrol show config\fR command.
+This parameter would generally be used if whole nodes
+are allocated to jobs (\fBSelectType=select/linear\fR).
+Also see \fB\-\-mem\-per\-cpu\fR.
+\fB\-\-mem\fR and \fB\-\-mem\-per\-cpu\fR are mutually exclusive.
+
+.TP
+\fB\-\-mem\-per\-cpu\fR[=]<\fIMB\fR>
+Minimum memory required per allocated CPU in MegaBytes.
+Default value is \fBDefMemPerCPU\fR and the maximum value is
+\fBMaxMemPerCPU\fR. If configured, both parameters can be
+seen using the \fBscontrol show config\fR command.
+This parameter would generally be used if individual processors
+are allocated to jobs (\fBSelectType=select/cons_res\fR).
+Also see \fB\-\-mem\fR.
+\fB\-\-mem\fR and \fB\-\-mem\-per\-cpu\fR are mutually exclusive.
 
 .TP
 \fB\-\-mincores\fR[=]<\fIn\fR>
@@ -495,13 +507,6 @@ Acceptable time formats include "minutes", "minutes:seconds",
 "hours:minutes:seconds", "days\-hours", "days\-hours:minutes" and 
 "days\-hours:minutes:seconds".
 
-.TP
-\fB\-\-task\-mem\fR[=]<\fIMB\fR>
-Mimimum memory available per task in MegaBytes.
-Default value is \fBDefMemPerTask\fR and the maximum value is
-\fBMaxMemPerTask\fR, both of which can be seen using the
-\fBscontrol show config\fR command.
-
 .TP
 \fB\-\-tmp\fR[=]<\fIMB\fR>
 Specify a minimum amount of temporary disk space.
@@ -709,6 +714,7 @@ salloc \-N5 srun \-n10 myprogram
 
 .SH "COPYING"
 Copyright (C) 2006\-2007 The Regents of the University of California.
+Copyright (C) 2008 Lawrence Livermore National Security.
 Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
 LLNL\-CODE\-402394.
 .LP
diff --git a/doc/man/man1/sbatch.1 b/doc/man/man1/sbatch.1
index 03accb92bfe..918f7843130 100644
--- a/doc/man/man1/sbatch.1
+++ b/doc/man/man1/sbatch.1
@@ -1,4 +1,4 @@
-.TH "sbatch" "1" "SLURM 1.3" "May 2008" "SLURM Commands"
+.TH "sbatch" "1" "SLURM 1.3" "July 2008" "SLURM Commands"
 .SH "NAME"
 .LP 
 sbatch \- Submit a batch script to SLURM.
@@ -330,11 +330,24 @@ The default value is the username of the submitting user.
 .TP
 \fB\-\-mem\fR[=]<\fIMB\fR>
 Specify the real memory required per node in MegaBytes.
-If a value is specified, that quantity of memory will be 
-reserved for this job. 
-If no value is specified and real memory is exhausted on 
-any allocated node then the job is subject to cancellation.
-Also see \fB\-\-task\-mem\fR.
+Default value is \fBDefMemPerNode\fR and the maximum value is
+\fBMaxMemPerNode\fR. If configured, both parameters can be
+seen using the \fBscontrol show config\fR command.
+This parameter would generally be used if whole nodes
+are allocated to jobs (\fBSelectType=select/linear\fR).
+Also see \fB\-\-mem\-per\-cpu\fR.
+\fB\-\-mem\fR and \fB\-\-mem\-per\-cpu\fR are mutually exclusive.
+
+.TP
+\fB\-\-mem\-per\-cpu\fR[=]<\fIMB\fR>
+Minimum memory required per allocated CPU in MegaBytes.
+Default value is \fBDefMemPerCPU\fR and the maximum value is
+\fBMaxMemPerCPU\fR. If configured, both parameters can be
+seen using the \fBscontrol show config\fR command.
+This parameter would generally be used if individual processors
+are allocated to jobs (\fBSelectType=select/cons_res\fR).
+Also see \fB\-\-mem\fR.
+\fB\-\-mem\fR and \fB\-\-mem\-per\-cpu\fR are mutually exclusive.
 
 .TP
 \fB\-\-mincores\fR[=]<\fIn\fR>
@@ -582,13 +595,6 @@ Acceptable time formats include "minutes", "minutes:seconds",
 "hours:minutes:seconds", "days\-hours", "days\-hours:minutes" and 
 "days\-hours:minutes:seconds".
 
-.TP
-\fB\-\-task\-mem\fR[=]<\fIMB\fR>
-Mimimum memory available per task in MegaBytes.
-Default value is \fBDefMemPerTask\fR and the maximum value is
-\fBMaxMemPerTask\fR, both of which can be seen using the
-\fBscontrol show config\fR command.
-
 .TP
 \fB\-\-tasks\-per\-node\fR[=]<\fIn\fR>
 Specify the number of tasks to be launched per node.
@@ -867,6 +873,7 @@ host4
 
 .SH "COPYING"
 Copyright (C) 2006\-2007 The Regents of the University of California.
+Copyright (C) 2008 Lawrence Livermore National Security.
 Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
 LLNL\-CODE\-402394.
 .LP
diff --git a/doc/man/man1/srun.1 b/doc/man/man1/srun.1
index e7c755e885d..5aca020e019 100644
--- a/doc/man/man1/srun.1
+++ b/doc/man/man1/srun.1
@@ -1,6 +1,4 @@
-.\" $Id$
-.\"
-.TH SRUN "1" "May 2008" "srun 1.3" "slurm components"
+.TH SRUN "1" "July 2008" "srun 1.3" "slurm components"
 
 .SH "NAME"
 srun \- run parallel jobs
@@ -425,11 +423,24 @@ The default value is the submitting user.
 .TP
 \fB\-\-mem\fR[=]<\fIMB\fR>
 Specify the real memory required per node in MegaBytes.
-If a value is specified, that quantity of memory will be 
-reserved for this job. 
-If no value is specified and real memory is exhausted on 
-any allocated node then the job is subject to cancellation.
-Also see \fB\-\-task\-mem\fR.
+Default value is \fBDefMemPerNode\fR and the maximum value is
+\fBMaxMemPerNode\fR. If configured, both parameters can be
+seen using the \fBscontrol show config\fR command.
+This parameter would generally be used if whole nodes
+are allocated to jobs (\fBSelectType=select/linear\fR).
+Also see \fB\-\-mem\-per\-cpu\fR.
+\fB\-\-mem\fR and \fB\-\-mem\-per\-cpu\fR are mutually exclusive.
+
+.TP
+\fB\-\-mem\-per\-cpu\fR[=]<\fIMB\fR>
+Minimum memory required per allocated CPU in MegaBytes.
+Default value is \fBDefMemPerCPU\fR and the maximum value is
+\fBMaxMemPerCPU\fR. If configured, both parameters can be
+seen using the \fBscontrol show config\fR command.
+This parameter would generally be used if individual processors
+are allocated to jobs (\fBSelectType=select/cons_res\fR).
+Also see \fB\-\-mem\fR.
+\fB\-\-mem\fR and \fB\-\-mem\-per\-cpu\fR are mutually exclusive.
 
 .TP
 \fB\-\-mem_bind\fR=[{\fIquiet,verbose\fR},]\fItype\fR
@@ -843,13 +854,6 @@ in slurm.conf is executed. This is meant to be a very short\-lived
 program. If it fails to terminate within a few seconds, it will 
 be killed along with any descendant processes.
 
-.TP
-\fB\-\-task\-mem\fR[=]<\fIMB\fR>
-Mimimum memory available per task in MegaBytes.
-Default value is \fBDefMemPerTask\fR and the maximum value is
-\fBMaxMemPerTask\fR, both of which can be seen using the
-\fBscontrol show config\fR command.
-
 .TP
 \fB\-\-task\-prolog\fR=\fIexecutable\fR
 The \fBslurmd\fR daemon will run \fIexecutable\fR just before launching 
@@ -1624,6 +1628,7 @@ wait
 
 .SH "COPYING"
 Copyright (C) 2006\-2007 The Regents of the University of California.
+Copyright (C) 2008 Lawrence Livermore National Security.
 Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
 LLNL\-CODE\-402394.
 .LP
diff --git a/doc/man/man5/slurm.conf.5 b/doc/man/man5/slurm.conf.5
index 1d908839d36..6e625fa804f 100644
--- a/doc/man/man5/slurm.conf.5
+++ b/doc/man/man5/slurm.conf.5
@@ -1,4 +1,4 @@
-.TH "slurm.conf" "5" "June 2008" "slurm.conf 1.3" "Slurm configuration file"
+.TH "slurm.conf" "5" "July 2008" "slurm.conf 1.3" "Slurm configuration file"
 
 .SH "NAME"
 slurm.conf \- Slurm configuration file 
@@ -208,11 +208,25 @@ License (GPL).
 The default value is "crypto/openssl".
 
 .TP
-\fBDefMemPerTask\fR
-Default real memory size available per task in MegaBytes. 
+\fBDefMemPerCPU\fR
+Default real memory size available per allocated CPU in MegaBytes. 
 Used to avoid over\-subscribing memory and causing paging.
-Also see \fBMaxMemPerTask\fR.
+\fBDefMemPerCPU\fR would generally be used if individual processors
+are allocated to jobs (\fBSelectType=select/cons_res\fR).
 The default value is 0 (unlimited).
+Also see \fBDefMemPerNode\fR and \fBMaxMemPerCPU\fR.
+\fBDefMemPerCPU\fR and \fBDefMemPerNode\fR are mutually exclusive.
+
+.TP
+\fBDefMemPerNode\fR
+Default real memory size available per allocated node in MegaBytes.
+Used to avoid over\-subscribing memory and causing paging.
+\fBDefMemPerNode\fR would generally be used if whole nodes
+are allocated to jobs (\fBSelectType=select/linear\fR) and
+resources are shared (\fBShared=yes\fR or \fBShared=force\fR).
+The default value is 0 (unlimited).
+Also see \fBDefMemPerCPU\fR and \fBMaxMemPerNode\fR.
+\fBDefMemPerCPU\fR and \fBDefMemPerNode\fR are mutually exclusive.
 
 .TP
 \fBDefaultStorageHost\fR
@@ -525,11 +539,25 @@ of the slurmctld daemon.
 May not exceed 65533.
 
 .TP
-\fBMaxMemPerTask\fR
-Maximum real memory size available per task in MegaBytes. 
+\fBMaxMemPerCPU\fR
+Maximum real memory size available per allocated CPU in MegaBytes. 
+Used to avoid over\-subscribing memory and causing paging.
+\fBMaxMemPerCPU\fR would generally be used if individual processors
+are allocated to jobs (\fBSelectType=select/cons_res\fR).
+The default value is 0 (unlimited).
+Also see \fBDefMemPerCPU\fR and \fBMaxMemPerNode\fR.
+\fBMaxMemPerCPU\fR and \fBMaxMemPerNode\fR are mutually exclusive.
+
+.TP
+\fBMaxMemPerNode\fR
+Maximum real memory size available per allocated node in MegaBytes.
 Used to avoid over\-subscribing memory and causing paging.
-Also see \fBDefMemPerTask\fR.
+\fBMaxMemPerNode\fR would generally be used if whole nodes
+are allocated to jobs (\fBSelectType=select/linear\fR) and
+resources are shared (\fBShared=yes\fR or \fBShared=force\fR).
 The default value is 0 (unlimited).
+Also see \fBDefMemPerNode\fR and \fBMaxMemPerCPU\fR.
+\fBMaxMemPerCPU\fR and \fBMaxMemPerNode\fR are mutually exclusive.
 
 .TP
 \fBMessageTimeout\fR
@@ -835,22 +863,26 @@ On single\-core systems, each CPUs will be considered a CPU.
 .TP
 \fBCR_CPU_Memory\fR
 CPUs and memory are consumable resources.
+Setting a value for \fBDefMemPerCPU\fR is strongly recommended.
 .TP
 \fBCR_Core\fR
 Cores are consumable resources.
 .TP
 \fBCR_Core_Memory\fR
 Cores and memory are consumable resources.
+Setting a value for \fBDefMemPerCPU\fR is strongly recommended.
 .TP
 \fBCR_Socket\fR
 Sockets are consumable resources.
 .TP
 \fBCR_Socket_Memory\fR
 Memory and CPUs are consumable resources.
+Setting a value for \fBDefMemPerCPU\fR is strongly recommended.
 .TP
 \fBCR_Memory\fR
 Memory is a consumable resource.
 NOTE: This implies \fIShared=YES\fR or \fIShared=FORCE\fR for all partitions.
+Setting a value for \fBDefMemPerCPU\fR is strongly recommended.
 .RE
 
 .TP
diff --git a/slurm/slurm.h.in b/slurm/slurm.h.in
index 775a449d469..ba543b016cf 100644
--- a/slurm/slurm.h.in
+++ b/slurm/slurm.h.in
@@ -448,6 +448,7 @@ typedef enum select_type_plugin_info {
 #define TASK_PARAM_CPUSETS 0x0001
 #define TASK_PARAM_SCHED   0x0002
 
+#define MEM_PER_CPU  0x80000000
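+/* MEM_PER_CPU flags a memory value as MegaBytes per allocated CPU;
+ * without the flag the value is MegaBytes per node. */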
 #define SHARED_FORCE 0x8000
 
 /*****************************************************************************\
@@ -528,7 +529,9 @@ typedef struct job_descriptor {	/* For submit, allocate, and update requests */
 	uint16_t job_min_sockets;  /* minimum sockets per node, default=0 */
 	uint16_t job_min_cores;    /* minimum cores per processor, default=0 */
 	uint16_t job_min_threads;  /* minimum threads per core, default=0 */
-	uint32_t job_min_memory;   /* minimum real memory per node, default=0 */
+	uint32_t job_min_memory;   /* minimum real memory per node OR 
+				    * real memory per CPU | MEM_PER_CPU,
+				    * default=0 (no limit) */
 	uint32_t job_min_tmp_disk; /* minimum tmp disk per node, default=0 */
 	uint32_t num_procs;	/* total count of processors required, 
 				 * default=0 */
diff --git a/src/api/config_info.c b/src/api/config_info.c
index db17884c6c4..f1867c87ae1 100644
--- a/src/api/config_info.c
+++ b/src/api/config_info.c
@@ -153,11 +153,15 @@ void slurm_print_ctl_conf ( FILE* out,
 		slurm_ctl_conf_ptr->control_machine);
 	fprintf(out, "CryptoType              = %s\n",
 		slurm_ctl_conf_ptr->crypto_type);
-	if (slurm_ctl_conf_ptr->def_mem_per_task) {
-		fprintf(out, "DefMemPerTask           = %u\n",
+	if (slurm_ctl_conf_ptr->def_mem_per_task & MEM_PER_CPU) {
+		fprintf(out, "DefMemPerCPU            = %u\n",
+			slurm_ctl_conf_ptr->def_mem_per_task &
+			(~MEM_PER_CPU));
+	} else if (slurm_ctl_conf_ptr->def_mem_per_task) {
+		fprintf(out, "DefMemPerNode           = %u\n",
 			slurm_ctl_conf_ptr->def_mem_per_task);
 	} else
-		fprintf(out, "DefMemPerTask           = UNLIMITED\n");
+		fprintf(out, "DefMemPerCPU            = UNLIMITED\n");
 	if (slurm_ctl_conf_ptr->disable_root_jobs)
 		fprintf(out, "DisableRootJobs         = YES\n");
 	else
@@ -220,11 +224,15 @@ void slurm_print_ctl_conf ( FILE* out,
 		slurm_ctl_conf_ptr->mail_prog);
 	fprintf(out, "MaxJobCount             = %u\n", 
 		slurm_ctl_conf_ptr->max_job_cnt);
-	if (slurm_ctl_conf_ptr->max_mem_per_task) {
-		fprintf(out, "MaxMemPerTask           = %u\n",
+	if (slurm_ctl_conf_ptr->max_mem_per_task & MEM_PER_CPU) {
+		fprintf(out, "MaxMemPerCPU            = %u\n",
+			slurm_ctl_conf_ptr->max_mem_per_task &
+			(~MEM_PER_CPU));
+	} else if (slurm_ctl_conf_ptr->max_mem_per_task) {
+		fprintf(out, "MaxMemPerNode           = %u\n",
 			slurm_ctl_conf_ptr->max_mem_per_task);
 	} else
-		fprintf(out, "MaxMemPerTask           = UNLIMITED\n");
+		fprintf(out, "MaxMemPerCPU            = UNLIMITED\n");
 	fprintf(out, "MessageTimeout          = %u\n",
 		slurm_ctl_conf_ptr->msg_timeout);
 	fprintf(out, "MinJobAge               = %u\n", 
diff --git a/src/api/init_msg.c b/src/api/init_msg.c
index abb3d973d0f..333752b31d3 100644
--- a/src/api/init_msg.c
+++ b/src/api/init_msg.c
@@ -1,8 +1,8 @@
 /*****************************************************************************\
  *  init_msg.c - initialize RPC messages contents
- *  $Id$
  *****************************************************************************
- *  Copyright (C) 2002-2006 The Regents of the University of California.
+ *  Copyright (C) 2002-2007 The Regents of the University of California.
+ *  Copyright (C) 2008 Lawrence Livermore National Security.
  *  Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
  *  Written by Morris Jette <jette1@llnl.gov>.
  *  LLNL-CODE-402394.
@@ -55,71 +55,25 @@
  */
 void slurm_init_job_desc_msg(job_desc_msg_t * job_desc_msg)
 {
-	job_desc_msg->account     = NULL;
-	job_desc_msg->acctg_freq  = (uint16_t) NO_VAL;
-	job_desc_msg->alloc_node  = NULL;
-	job_desc_msg->alloc_sid   = NO_VAL;
-	job_desc_msg->comment     = NULL;
-	job_desc_msg->contiguous  = (uint16_t) NO_VAL;
-	job_desc_msg->cpus_per_task = (uint16_t) NO_VAL;
-	job_desc_msg->ntasks_per_node   = (uint16_t) NO_VAL;
-	job_desc_msg->ntasks_per_socket = (uint16_t) NO_VAL;
-	job_desc_msg->ntasks_per_core   = (uint16_t) NO_VAL;
-	job_desc_msg->dependency  = NULL;
-	job_desc_msg->environment = ((char **) NULL);
-	job_desc_msg->env_size    = 0;
-	job_desc_msg->features    = NULL;
-	job_desc_msg->immediate   = 0;
-	job_desc_msg->job_id      = NO_VAL;
-	job_desc_msg->job_min_cores   = (uint16_t) NO_VAL;
-	job_desc_msg->job_min_procs   = (uint16_t) NO_VAL;
-	job_desc_msg->job_min_sockets = (uint16_t) NO_VAL;
-	job_desc_msg->job_min_threads = (uint16_t) NO_VAL;
-	job_desc_msg->job_min_memory  = NO_VAL;
-	job_desc_msg->job_min_tmp_disk= NO_VAL;
-	job_desc_msg->kill_on_node_fail = (uint16_t) NO_VAL;
-	job_desc_msg->licenses    = NULL;
-	job_desc_msg->name        = NULL;
-	job_desc_msg->network     = NULL;
-	job_desc_msg->nice        = NICE_OFFSET;
-	job_desc_msg->ntasks_per_core   = (uint16_t) NO_VAL;
-	job_desc_msg->ntasks_per_node   = (uint16_t) NO_VAL;
-	job_desc_msg->ntasks_per_socket = (uint16_t) NO_VAL;
-	job_desc_msg->num_tasks   = NO_VAL;
-	job_desc_msg->open_mode   = 0;	/* system default */
-	job_desc_msg->overcommit  = (uint8_t) NO_VAL;
-	job_desc_msg->partition   = NULL;
-	job_desc_msg->plane_size  = (uint16_t) NO_VAL;
-	job_desc_msg->priority    = NO_VAL;
-	job_desc_msg->req_nodes   = NULL;
-	job_desc_msg->exc_nodes   = NULL;
-	job_desc_msg->script      = NULL;
-	job_desc_msg->argv        = ((char **) NULL);
-	job_desc_msg->argc        = 0;
-	job_desc_msg->shared      = (uint16_t) NO_VAL;
-	job_desc_msg->task_dist   = (uint16_t) NO_VAL;
-	job_desc_msg->time_limit  = NO_VAL;
-	job_desc_msg->num_procs   = NO_VAL;
-	job_desc_msg->max_nodes   = NO_VAL;
-	job_desc_msg->min_nodes   = NO_VAL;
-	job_desc_msg->max_sockets = (uint16_t) NO_VAL;
-	job_desc_msg->min_sockets = (uint16_t) NO_VAL;
-	job_desc_msg->max_cores   = (uint16_t) NO_VAL;
-	job_desc_msg->min_cores   = (uint16_t) NO_VAL;
-	job_desc_msg->max_threads = (uint16_t) NO_VAL;
-	job_desc_msg->min_threads = (uint16_t) NO_VAL;
-	job_desc_msg->err         = NULL;
-	job_desc_msg->in          = NULL;
-	job_desc_msg->out         = NULL;
-	job_desc_msg->user_id     = NO_VAL;
-	job_desc_msg->group_id    = NO_VAL;
-	job_desc_msg->work_dir    = NULL;
-	job_desc_msg->alloc_resp_port = 0;
-	job_desc_msg->other_port  = 0;
-	job_desc_msg->mail_type   = 0;
-	job_desc_msg->mail_user   = NULL;
-	job_desc_msg->begin_time  = 0;
-	job_desc_msg->requeue  = (uint16_t) NO_VAL;
+	job_desc_msg->account		= NULL;
+	job_desc_msg->acctg_freq	= (uint16_t) NO_VAL;
+	job_desc_msg->alloc_node	= NULL;
+	job_desc_msg->alloc_resp_port	= 0;
+	job_desc_msg->alloc_sid		= NO_VAL;
+	job_desc_msg->argc		= 0;
+	job_desc_msg->argv		= ((char **) NULL);
+	job_desc_msg->begin_time	= 0;
+	job_desc_msg->blrtsimage	= NULL;
+	job_desc_msg->comment		= NULL;
+	job_desc_msg->conn_type		= (uint16_t) NO_VAL;
+	job_desc_msg->contiguous	= (uint16_t) NO_VAL;
+	job_desc_msg->cpus_per_task	= (uint16_t) NO_VAL;
+	job_desc_msg->dependency	= NULL;
+	job_desc_msg->environment	= ((char **) NULL);
+	job_desc_msg->env_size		= 0;
+	job_desc_msg->err		= NULL;
+	job_desc_msg->exc_nodes		= NULL;
+	job_desc_msg->features		= NULL;
 #if SYSTEM_DIMENSIONS
 {
 	int i;
@@ -127,14 +81,58 @@ void slurm_init_job_desc_msg(job_desc_msg_t * job_desc_msg)
 		job_desc_msg->geometry[i] = (uint16_t) NO_VAL;
 }
 #endif
-	job_desc_msg->conn_type   = (uint16_t) NO_VAL;
-	job_desc_msg->reboot      = (uint16_t) NO_VAL;
-	job_desc_msg->rotate      = (uint16_t) NO_VAL;
-	job_desc_msg->blrtsimage = NULL;
-	job_desc_msg->linuximage = NULL;
-	job_desc_msg->mloaderimage = NULL;
-	job_desc_msg->ramdiskimage = NULL;
-	job_desc_msg->select_jobinfo = NULL;
+	job_desc_msg->group_id		= NO_VAL;
+	job_desc_msg->immediate		= 0;
+	job_desc_msg->in		= NULL;
+	job_desc_msg->job_id		= NO_VAL;
+	job_desc_msg->job_min_cores	= (uint16_t) NO_VAL;
+	job_desc_msg->job_min_procs	= (uint16_t) NO_VAL;
+	job_desc_msg->job_min_sockets	= (uint16_t) NO_VAL;
+	job_desc_msg->job_min_threads	= (uint16_t) NO_VAL;
+	job_desc_msg->job_min_memory    = NO_VAL;
+	job_desc_msg->job_min_tmp_disk  = NO_VAL;
+	job_desc_msg->kill_on_node_fail = (uint16_t) NO_VAL;
+	job_desc_msg->licenses		= NULL;
+	job_desc_msg->linuximage	= NULL;
+	job_desc_msg->mail_type		= 0;
+	job_desc_msg->mail_user		= NULL;
+	job_desc_msg->max_cores		= (uint16_t) NO_VAL;
+	job_desc_msg->max_nodes		= NO_VAL;
+	job_desc_msg->max_sockets	= (uint16_t) NO_VAL;
+	job_desc_msg->max_threads	= (uint16_t) NO_VAL;
+	job_desc_msg->min_cores		= (uint16_t) NO_VAL;
+	job_desc_msg->min_nodes		= NO_VAL;
+	job_desc_msg->min_sockets	= (uint16_t) NO_VAL;
+	job_desc_msg->min_threads	= (uint16_t) NO_VAL;
+	job_desc_msg->mloaderimage	= NULL;
+	job_desc_msg->name		= NULL;
+	job_desc_msg->network		= NULL;
+	job_desc_msg->nice		= NICE_OFFSET;
+	job_desc_msg->ntasks_per_core	= (uint16_t) NO_VAL;
+	job_desc_msg->ntasks_per_node	= (uint16_t) NO_VAL;
+	job_desc_msg->ntasks_per_socket	= (uint16_t) NO_VAL;
+	job_desc_msg->num_procs		= NO_VAL;
+	job_desc_msg->num_tasks		= NO_VAL;
+	job_desc_msg->open_mode		= 0;	/* system default */
+	job_desc_msg->other_port	= 0;
+	job_desc_msg->out		= NULL;
+	job_desc_msg->overcommit	= (uint8_t) NO_VAL;
+	job_desc_msg->partition		= NULL;
+	job_desc_msg->plane_size	= (uint16_t) NO_VAL;
+	job_desc_msg->priority		= NO_VAL;
+	job_desc_msg->ramdiskimage	= NULL;
+	job_desc_msg->reboot		= (uint16_t) NO_VAL;
+	job_desc_msg->resp_host		= NULL;
+	job_desc_msg->req_nodes		= NULL;
+	job_desc_msg->requeue		= (uint16_t) NO_VAL;
+	job_desc_msg->rotate		= (uint16_t) NO_VAL;
+	job_desc_msg->script		= NULL;
+	job_desc_msg->select_jobinfo	= NULL;
+	job_desc_msg->shared		= (uint16_t) NO_VAL;
+	job_desc_msg->task_dist		= (uint16_t) NO_VAL;
+	job_desc_msg->time_limit	= NO_VAL;
+	job_desc_msg->user_id		= NO_VAL;
+	job_desc_msg->work_dir		= NULL;
 }
 
 /*
diff --git a/src/api/job_info.c b/src/api/job_info.c
index 6a53047bd4e..f44ed7db997 100644
--- a/src/api/job_info.c
+++ b/src/api/job_info.c
@@ -371,13 +371,18 @@ slurm_sprint_job_info ( job_info_t * job_ptr, int one_liner )
 		xstrcat(out, "\n   ");
 
 	/****** Line 10 ******/
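+	/* A set MEM_PER_CPU flag means the limit is per allocated CPU,
+	 * otherwise per node; strip the flag and label the value
+	 * accordingly before formatting it. */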
+	if (job_ptr->job_min_memory & MEM_PER_CPU) {
+		job_ptr->job_min_memory &= (~MEM_PER_CPU);
+		tmp3_ptr = "CPU";
+	} else
+		tmp3_ptr = "Node";
 	convert_num_unit((float)job_ptr->job_min_memory, tmp1, sizeof(tmp1),
 			 UNIT_NONE);
 	convert_num_unit((float)job_ptr->job_min_tmp_disk, tmp2, sizeof(tmp2),
 			 UNIT_NONE);
 	snprintf(tmp_line, sizeof(tmp_line), 
-		"MinMemory=%s MinTmpDisk=%s Features=%s",
-		tmp1, tmp2, job_ptr->features);
+		"MinMemory%s=%s MinTmpDisk=%s Features=%s",
+		tmp3_ptr, tmp1, tmp2, job_ptr->features);
 	xstrcat(out, tmp_line);
 	if (one_liner)
 		xstrcat(out, " ");
diff --git a/src/api/step_ctx.c b/src/api/step_ctx.c
index cfbafbf6362..4bbcd2da4af 100644
--- a/src/api/step_ctx.c
+++ b/src/api/step_ctx.c
@@ -58,12 +58,14 @@ static void
 _job_fake_cred(struct slurm_step_ctx_struct *ctx)
 {
 	slurm_cred_arg_t arg;
-	arg.jobid    = ctx->job_id;
-	arg.stepid   = ctx->step_resp->job_step_id;
-	arg.uid      = ctx->user_id;
-	arg.hostlist = ctx->step_req->node_list;
-        arg.alloc_lps_cnt = 0;    
-        arg.alloc_lps     =  NULL; 
+	arg.alloc_lps_cnt = 0;
+	arg.alloc_lps     = NULL;
+	arg.hostlist      = ctx->step_req->node_list;
+	arg.job_mem       = 0;
+	arg.jobid         = ctx->job_id;
+	arg.stepid        = ctx->step_resp->job_step_id;
+	arg.task_mem      = 0;
+	arg.uid           = ctx->user_id;
 	ctx->step_resp->cred = slurm_cred_faker(&arg);
 }
 
diff --git a/src/common/read_config.c b/src/common/read_config.c
index 724caa3e10b..dfcf0ffd08d 100644
--- a/src/common/read_config.c
+++ b/src/common/read_config.c
@@ -146,7 +146,9 @@ s_p_options_t slurm_conf_options[] = {
 	{"DefaultStoragePort", S_P_UINT32},
 	{"DefaultStorageType", S_P_STRING},
 	{"DefaultStorageUser", S_P_STRING},
-	{"DefMemPerTask", S_P_UINT32},
+	{"DefMemPerCPU", S_P_UINT32},
+	{"DefMemPerNode", S_P_UINT32},
+	{"DefMemPerTask", S_P_UINT32},	/* defunct */
 	{"DisableRootJobs", S_P_BOOLEAN},
 	{"EnforcePartLimits", S_P_BOOLEAN},
 	{"Epilog", S_P_STRING},
@@ -179,7 +181,9 @@ s_p_options_t slurm_conf_options[] = {
 	{"Licenses", S_P_STRING},
 	{"MailProg", S_P_STRING},
 	{"MaxJobCount", S_P_UINT16},
-	{"MaxMemPerTask", S_P_UINT32},
+	{"MaxMemPerCPU", S_P_UINT32},
+	{"MaxMemPerNode", S_P_UINT32},
+	{"MaxMemPerTask", S_P_UINT32},	/* defunct */
 	{"MessageTimeout", S_P_UINT16},
 	{"MinJobAge", S_P_UINT16},
 	{"MpichGmDirectSupport", S_P_LONG, defunct_option},
@@ -1551,7 +1555,7 @@ validate_and_set_defaults(slurm_ctl_conf_t *conf, s_p_hashtbl_t *hashtbl)
 	s_p_get_string(&default_storage_host, "DefaultStorageHost", hashtbl);
 	s_p_get_string(&default_storage_user, "DefaultStorageUser", hashtbl);
 	s_p_get_string(&default_storage_pass, "DefaultStoragePass", hashtbl);
-	s_p_get_string(&default_storage_loc, "DefaultStorageLoc", hashtbl);
+	s_p_get_string(&default_storage_loc,  "DefaultStorageLoc", hashtbl);
 	s_p_get_uint32(&default_storage_port, "DefaultStoragePort", hashtbl);
 
 	if (!s_p_get_string(&conf->job_credential_private_key,
@@ -1577,8 +1581,11 @@ validate_and_set_defaults(slurm_ctl_conf_t *conf, s_p_hashtbl_t *hashtbl)
 	if (!s_p_get_string(&conf->crypto_type, "CryptoType", hashtbl))
 		 conf->crypto_type = xstrdup(DEFAULT_CRYPTO_TYPE);
 
-	if (!s_p_get_uint32(&conf->def_mem_per_task, "DefMemPerTask", hashtbl))
-		conf->def_mem_per_task = DEFAULT_MEM_PER_TASK;
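+	/* DefMemPerCPU (and the obsolete DefMemPerTask) are stored with the
+	 * MEM_PER_CPU flag set; DefMemPerNode is stored without it. */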
+	if ((s_p_get_uint32(&conf->def_mem_per_task, "DefMemPerCPU", hashtbl)) ||
+	    (s_p_get_uint32(&conf->def_mem_per_task, "DefMemPerTask", hashtbl)))
+		conf->def_mem_per_task |= MEM_PER_CPU;
+	else if (!s_p_get_uint32(&conf->def_mem_per_task, "DefMemPerNode", hashtbl))
+		conf->def_mem_per_task = DEFAULT_MEM_PER_CPU;
 
 	if (!s_p_get_boolean((bool *) &conf->disable_root_jobs, 
 			     "DisableRootJobs", hashtbl))
@@ -1708,8 +1715,11 @@ validate_and_set_defaults(slurm_ctl_conf_t *conf, s_p_hashtbl_t *hashtbl)
 	if (!s_p_get_uint16(&conf->max_job_cnt, "MaxJobCount", hashtbl))
 		conf->max_job_cnt = DEFAULT_MAX_JOB_COUNT;
 
-	if (!s_p_get_uint32(&conf->max_mem_per_task, "MaxMemPerTask", hashtbl))
-		conf->max_mem_per_task = DEFAULT_MAX_MEM_PER_TASK;
+	if ((s_p_get_uint32(&conf->max_mem_per_task, "MaxMemPerCPU", hashtbl)) ||
+	    (s_p_get_uint32(&conf->max_mem_per_task, "MaxMemPerTask", hashtbl)))
+		conf->max_mem_per_task |= MEM_PER_CPU;
+	else if (!s_p_get_uint32(&conf->max_mem_per_task, "MaxMemPerNode", hashtbl))
+		conf->max_mem_per_task = DEFAULT_MAX_MEM_PER_CPU;
 
 	if (!s_p_get_uint16(&conf->msg_timeout, "MessageTimeout", hashtbl))
 		conf->msg_timeout = DEFAULT_MSG_TIMEOUT;
diff --git a/src/common/read_config.h b/src/common/read_config.h
index cddc90068df..c64361b4ccc 100644
--- a/src/common/read_config.h
+++ b/src/common/read_config.h
@@ -73,8 +73,8 @@ extern char *default_plugstack;
 #define DEFAULT_KILL_WAIT           30
 #define DEFAULT_MAIL_PROG           "/bin/mail"
 #define DEFAULT_MAX_JOB_COUNT       5000
-#define DEFAULT_MEM_PER_TASK        0
-#define DEFAULT_MAX_MEM_PER_TASK    0
+#define DEFAULT_MEM_PER_CPU         0
+#define DEFAULT_MAX_MEM_PER_CPU     0
 #define DEFAULT_MIN_JOB_AGE         300
 #define DEFAULT_MPI_DEFAULT         "none"
 #define DEFAULT_MSG_TIMEOUT         10
diff --git a/src/plugins/select/cons_res/select_cons_res.c b/src/plugins/select/cons_res/select_cons_res.c
index d73c11b623c..26be1838eeb 100644
--- a/src/plugins/select/cons_res/select_cons_res.c
+++ b/src/plugins/select/cons_res/select_cons_res.c
@@ -599,6 +599,17 @@ static uint16_t _get_task_count(struct node_cr_record *select_node_ptr,
 					 &threads, alloc_cores, 
 					 cr_type, job_ptr->job_id,
 					 this_node->node_ptr->name);
+
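+	/* A per-CPU memory request also caps the usable CPU count: only as
+	 * many CPUs as fit in the node's unallocated memory are counted. */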
+	if (job_ptr->details->job_min_memory & MEM_PER_CPU) {
+		uint32_t free_mem, mem_per_cpu;
+		int max_cpus;
+		mem_per_cpu = job_ptr->details->job_min_memory & (~MEM_PER_CPU);
+		free_mem = this_node->real_memory - this_node->alloc_memory;
+		max_cpus = free_mem / mem_per_cpu;
+		/* info("cpus avail:%d  mem for %d", numtasks, max_cpus); */
+		numtasks = MIN(numtasks, max_cpus);
+	}
+
 #if (CR_DEBUG)
 	info("cons_res: _get_task_count computed a_tasks %d s %d c %d "
 		"t %d on %s for job %d",
@@ -1992,8 +2003,9 @@ static int _verify_node_state(struct node_cr_record *select_node_ptr,
 			      enum node_cr_state job_node_req)
 {
 	int i;
-	uint32_t free_mem;
+	uint32_t free_mem, min_mem;
 
+	min_mem = job_ptr->details->job_min_memory & (~MEM_PER_CPU);
 	for (i = 0; i < select_node_cnt; i++) {
 		if (!bit_test(bitmap, i))
 			continue;
@@ -2003,7 +2015,7 @@ static int _verify_node_state(struct node_cr_record *select_node_ptr,
 		     (cr_type == CR_MEMORY) || (cr_type == CR_SOCKET_MEMORY))) {
 			free_mem = select_node_ptr[i].real_memory;
 			free_mem -= select_node_ptr[i].alloc_memory;
-			if (free_mem < job_ptr->details->job_min_memory)
+			if (free_mem < min_mem)
 				goto clear_bit;
 		}
 
@@ -2589,9 +2601,6 @@ static int _job_test(struct job_record *job_ptr, bitstr_t *bitmap,
 			job->cpus[j] = 0;
 		}
 		job->alloc_cpus[j] = 0;
-		if ((cr_type == CR_CORE_MEMORY) || (cr_type == CR_CPU_MEMORY) ||
-		    (cr_type == CR_MEMORY) || (cr_type == CR_SOCKET_MEMORY))
-			job->alloc_memory[j] = job_ptr->details->job_min_memory; 
 		if ((cr_type == CR_CORE) || (cr_type == CR_CORE_MEMORY)||
 		    (cr_type == CR_SOCKET) || (cr_type == CR_SOCKET_MEMORY)) {
 			_chk_resize_job(job, j, job->num_sockets[j]);
@@ -2652,6 +2661,26 @@ static int _job_test(struct job_record *job_ptr, bitstr_t *bitmap,
 		return error_code;
 	}
 
+	if (job_ptr->details->job_min_memory &&
+	    ((cr_type == CR_CORE_MEMORY) || (cr_type == CR_CPU_MEMORY) ||
+	     (cr_type == CR_MEMORY) || (cr_type == CR_SOCKET_MEMORY))) {
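+		/* Record each node's allocated memory: a per-CPU request is
+		 * scaled by the CPUs allocated on that node, while a
+		 * per-node request is used as-is. */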
+		j = 0;
+		for (i = 0; i < node_record_count; i++) {
+			if (bit_test(bitmap, i) == 0)
+				continue;
+			if (job_ptr->details->job_min_memory & MEM_PER_CPU) {
+				job->alloc_memory[j] = job_ptr->details->
+						       job_min_memory &
+						       (~MEM_PER_CPU);
+				job->alloc_memory[j] *= job->alloc_cpus[j];
+			} else {
+				job->alloc_memory[j] = job_ptr->details->
+						       job_min_memory;
+			}
+			j++;
+		}
+	}
+
 	_append_to_job_list(job);
 	last_cr_update_time = time(NULL);
 
diff --git a/src/plugins/select/linear/select_linear.c b/src/plugins/select/linear/select_linear.c
index f2106500255..787580d21ae 100644
--- a/src/plugins/select/linear/select_linear.c
+++ b/src/plugins/select/linear/select_linear.c
@@ -2,8 +2,6 @@
  *  select_linear.c - node selection plugin for simple one-dimensional 
  *  address space. Selects nodes for a job so as to minimize the number 
  *  of sets of consecutive nodes using a best-fit algorithm.
- *
- *  $Id$
  *****************************************************************************
  *  Copyright (C) 2004-2007 The Regents of the University of California.
  *  Copyright (C) 2008 Lawrence Livermore National Security.
@@ -559,7 +557,7 @@ static int _job_count_bitmap(struct node_cr_record *node_cr_ptr,
 {
 	int i, count = 0, total_jobs, total_run_jobs;
 	struct part_cr_record *part_cr_ptr;
-	uint32_t job_memory = 0;
+	uint32_t job_memory_cpu = 0, job_memory_node = 0;
 	bool exclusive;
 
 	xassert(node_cr_ptr);
@@ -572,24 +570,42 @@ static int _job_count_bitmap(struct node_cr_record *node_cr_ptr,
 	else
 		exclusive = true;
 
-	if (job_ptr->details->job_min_memory  && (cr_type == CR_MEMORY))
-		job_memory = job_ptr->details->job_min_memory;
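+	/* The MEM_PER_CPU flag distinguishes a per-CPU memory request from
+	 * a per-node request. */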
+	if (job_ptr->details->job_min_memory  && (cr_type == CR_MEMORY)) {
+		if (job_ptr->details->job_min_memory & MEM_PER_CPU) {
+			job_memory_cpu = job_ptr->details->job_min_memory &
+					 (~MEM_PER_CPU);
+		} else
+			job_memory_node = job_ptr->details->job_min_memory;
+	}
 
 	for (i = 0; i < node_record_count; i++) {
 		if (!bit_test(bitmap, i)) {
 			bit_clear(jobmap, i);
 			continue;
 		}
-
-		if (select_fast_schedule) {
-			if ((node_cr_ptr[i].alloc_memory + job_memory) >
-			     node_record_table_ptr[i].config_ptr->real_memory) {
-				bit_clear(jobmap, i);
-				continue;
+		if (job_memory_cpu || job_memory_node) {
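+			/* A per-CPU request is charged as the request times
+			 * the node's CPU count (configured count under
+			 * FastSchedule, actual count otherwise). */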
+			uint32_t alloc_mem, job_mem, avail_mem;
+			alloc_mem = node_cr_ptr[i].alloc_memory;
+			if (select_fast_schedule) {
+				avail_mem = node_record_table_ptr[i].
+					    config_ptr->real_memory;
+				if (job_memory_cpu) {
+					job_mem = job_memory_cpu *
+						  node_record_table_ptr[i].
+						  config_ptr->cpus;
+				} else
+					job_mem = job_memory_node;
+			} else {
+				avail_mem = node_record_table_ptr[i].
+					    real_memory;
+				if (job_memory_cpu) {
+					job_mem = job_memory_cpu *
+						  node_record_table_ptr[i].
+						  cpus;
+				} else
+					job_mem = job_memory_node;
 			}
-		} else {
-			if ((node_cr_ptr[i].alloc_memory + job_memory) >
-			     node_record_table_ptr[i].real_memory) {
+			if ((alloc_mem + job_mem) > avail_mem) {
 				bit_clear(jobmap, i);
 				continue;
 			}
@@ -1132,7 +1148,7 @@ static int _rm_job_from_nodes(struct node_cr_record *node_cr_ptr,
 {
 	int i, rc = SLURM_SUCCESS;
 	struct part_cr_record *part_cr_ptr;
-	uint32_t job_memory = 0;
+	uint32_t job_memory, job_memory_cpu = 0, job_memory_node = 0;
 
 	if (node_cr_ptr == NULL) {
 		error("%s: node_cr_ptr not initialized", pre_err);
@@ -1140,12 +1156,27 @@ static int _rm_job_from_nodes(struct node_cr_record *node_cr_ptr,
 	}
 
 	if (remove_all && job_ptr->details && 
-	    job_ptr->details->job_min_memory && (cr_type == CR_MEMORY))
-		job_memory = job_ptr->details->job_min_memory;
+	    job_ptr->details->job_min_memory && (cr_type == CR_MEMORY)) {
+		if (job_ptr->details->job_min_memory & MEM_PER_CPU) {
+			job_memory_cpu = job_ptr->details->job_min_memory &
+					 (~MEM_PER_CPU);
+		} else
+			job_memory_node = job_ptr->details->job_min_memory;
+	}
 
 	for (i = 0; i < select_node_cnt; i++) {
 		if (bit_test(job_ptr->node_bitmap, i) == 0)
 			continue;
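+		/* Convert a per-CPU charge into this node's total (per-CPU
+		 * value times the node's CPU count) before releasing it. */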
+		if (job_memory_cpu == 0)
+			job_memory = job_memory_node;
+		else if (select_fast_schedule) {
+			job_memory = job_memory_cpu *
+				     node_record_table_ptr[i].
+				     config_ptr->cpus;
+		} else {
+			job_memory = job_memory_cpu *
+				     node_record_table_ptr[i].cpus;
+		}
 		if (node_cr_ptr[i].alloc_memory >= job_memory)
 			node_cr_ptr[i].alloc_memory -= job_memory;
 		else {
@@ -1208,7 +1239,7 @@ static int _add_job_to_nodes(struct node_cr_record *node_cr_ptr,
 {
 	int i, rc = SLURM_SUCCESS, exclusive = 0;
 	struct part_cr_record *part_cr_ptr;
-	uint32_t job_memory = 0;
+	uint32_t job_memory_cpu = 0, job_memory_node = 0;
 
 	if (node_cr_ptr == NULL) {
 		error("%s: node_cr_ptr not initialized", pre_err);
@@ -1216,15 +1247,32 @@ static int _add_job_to_nodes(struct node_cr_record *node_cr_ptr,
 	}
 
 	if (alloc_all && job_ptr->details && 
-	    job_ptr->details->job_min_memory && (cr_type == CR_MEMORY))
-		job_memory = job_ptr->details->job_min_memory;
+	    job_ptr->details->job_min_memory && (cr_type == CR_MEMORY)) {
+		if (job_ptr->details->job_min_memory & MEM_PER_CPU) {
+			job_memory_cpu = job_ptr->details->job_min_memory &
+					 (~MEM_PER_CPU);
+		} else
+			job_memory_node = job_ptr->details->job_min_memory;
+	}
+
 	if (job_ptr->details->shared == 0)
 		exclusive = 1;
 
 	for (i = 0; i < select_node_cnt; i++) {
 		if (bit_test(job_ptr->node_bitmap, i) == 0)
 			continue;
-		node_cr_ptr[i].alloc_memory += job_memory;
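+		/* Charge the node's allocated memory: a per-CPU request
+		 * scales with the node's CPU count, a per-node request is
+		 * added directly. */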
+		if (job_memory_cpu == 0)
+			node_cr_ptr[i].alloc_memory += job_memory_node;
+		else if (select_fast_schedule) {
+			node_cr_ptr[i].alloc_memory += 
+					job_memory_cpu *
+					node_record_table_ptr[i].
+					config_ptr->cpus;
+		} else {
+			node_cr_ptr[i].alloc_memory += 
+					job_memory_cpu *
+					node_record_table_ptr[i].cpus;
+		}
 		if (exclusive) {
 			if (node_cr_ptr[i].exclusive_jobid) {
 				error("select/linear: conflicting exclusive "
@@ -1341,7 +1389,7 @@ static void _init_node_cr(void)
 	ListIterator part_iterator;
 	struct job_record *job_ptr;
 	ListIterator job_iterator;
-	uint32_t job_memory, step_mem;
+	uint32_t job_memory_cpu, job_memory_node, step_mem = 0;
 	int exclusive, i, node_inx;
 	ListIterator step_iterator;
 	struct step_record *step_ptr;
@@ -1375,11 +1423,17 @@ static void _init_node_cr(void)
 		    (job_ptr->job_state != JOB_SUSPENDED))
 			continue;
 
+		job_memory_cpu  = 0;
+		job_memory_node = 0;
 		if (job_ptr->details && 
-		    job_ptr->details->job_min_memory && (cr_type == CR_MEMORY))
-			job_memory = job_ptr->details->job_min_memory;
-		else
-			job_memory = 0;
+		    job_ptr->details->job_min_memory && (cr_type == CR_MEMORY)) {
+			if (job_ptr->details->job_min_memory & MEM_PER_CPU) {
+				job_memory_cpu = job_ptr->details->job_min_memory &
+						 (~MEM_PER_CPU);
+			} else {
+				job_memory_node = job_ptr->details->job_min_memory;
+			}
+		}
 		if (job_ptr->details->shared == 0)
 			exclusive = 1;
 		else
@@ -1400,7 +1454,18 @@ static void _init_node_cr(void)
 				}
 				node_cr_ptr[i].exclusive_jobid = job_ptr->job_id;
 			}
-			node_cr_ptr[i].alloc_memory += job_memory;
+			if (job_memory_cpu == 0)
+				node_cr_ptr[i].alloc_memory += job_memory_node;
+			else if (select_fast_schedule) {
+				node_cr_ptr[i].alloc_memory += 
+						job_memory_cpu *
+						node_record_table_ptr[i].
+						config_ptr->cpus;
+			} else {
+				node_cr_ptr[i].alloc_memory += 
+						job_memory_cpu *
+						node_record_table_ptr[i].cpus;
+			}
 			part_cr_ptr = node_cr_ptr[i].parts;
 			while (part_cr_ptr) {
 				if (part_cr_ptr->part_ptr != job_ptr->part_ptr) {
diff --git a/src/salloc/opt.c b/src/salloc/opt.c
index 9e17c746bd6..a46937536c3 100644
--- a/src/salloc/opt.c
+++ b/src/salloc/opt.c
@@ -125,7 +125,7 @@
 #define LONG_OPT_NTASKSPERNODE   0x136
 #define LONG_OPT_NTASKSPERSOCKET 0x137
 #define LONG_OPT_NTASKSPERCORE   0x138
-#define LONG_OPT_TASK_MEM        0x13a
+#define LONG_OPT_MEM_PER_CPU     0x13a
 #define LONG_OPT_HINT            0x13b
 #define LONG_OPT_ACCTG_FREQ      0x13c
 
@@ -267,7 +267,7 @@ static void _opt_default()
 	opt.minsockets      = -1;
 	opt.mincores        = -1;
 	opt.minthreads      = -1;
-	opt.task_mem	    = -1;
+	opt.mem_per_cpu	    = -1;
 	opt.realmem	    = -1;
 	opt.tmpdisk	    = -1;
 
@@ -512,8 +512,9 @@ void set_options(const int argc, char **argv)
 		{"mincores",      required_argument, 0, LONG_OPT_MINCORES},
 		{"minthreads",    required_argument, 0, LONG_OPT_MINTHREADS},
 		{"mem",           required_argument, 0, LONG_OPT_MEM},
-		{"job-mem",       required_argument, 0, LONG_OPT_TASK_MEM},
-		{"task-mem",      required_argument, 0, LONG_OPT_TASK_MEM},
+		{"job-mem",       required_argument, 0, LONG_OPT_MEM_PER_CPU},
+		{"task-mem",      required_argument, 0, LONG_OPT_MEM_PER_CPU},
+		{"mem-per-cpu",   required_argument, 0, LONG_OPT_MEM_PER_CPU},
 		{"hint",          required_argument, 0, LONG_OPT_HINT},
 		{"sockets-per-node", required_argument, 0, LONG_OPT_SOCKETSPERNODE},
 		{"cores-per-socket", required_argument, 0, LONG_OPT_CORESPERSOCKET},
@@ -761,9 +762,9 @@ void set_options(const int argc, char **argv)
 				exit(1);
 			}
 			break;
-		case LONG_OPT_TASK_MEM:
-			opt.task_mem = (int) str_to_bytes(optarg);
-			if (opt.task_mem < 0) {
+		case LONG_OPT_MEM_PER_CPU:
+			opt.mem_per_cpu = (int) str_to_bytes(optarg);
+			if (opt.mem_per_cpu < 0) {
 				error("invalid memory constraint %s", 
 				      optarg);
 				exit(1);
@@ -1015,15 +1016,11 @@ static bool _opt_verify(void)
 		verified = false;
 	}
 
-        /* When CR with memory as a CR is enabled we need to assign
-	 * adequate value or check the value to opt.mem */
-	if ((opt.realmem >= -1) && (opt.task_mem > 0)) {
-		if (opt.realmem == -1) {
-			opt.realmem = opt.task_mem;
-		} else if (opt.realmem < opt.task_mem) {
-			info("mem < task-mem - resizing mem to be equal "
-			     "to task-mem");
-			opt.realmem = opt.task_mem;
+	if ((opt.realmem > -1) && (opt.mem_per_cpu > -1)) {
+		if (opt.realmem < opt.mem_per_cpu) {
+			info("mem < mem-per-cpu - resizing mem to be equal "
+			     "to mem-per-cpu");
+			opt.realmem = opt.mem_per_cpu;
 		}
 	}
 	
@@ -1173,8 +1170,8 @@ static char *print_constraints()
 	if (opt.realmem > 0)
 		xstrfmtcat(buf, "mem=%dM ", opt.realmem);
 
-	if (opt.task_mem > 0)
-		xstrfmtcat(buf, "task-mem=%dM ", opt.task_mem);
+	if (opt.mem_per_cpu > 0)
+		xstrfmtcat(buf, "mem-per-cpu=%dM ", opt.mem_per_cpu);
 
 	if (opt.tmpdisk > 0)
 		xstrfmtcat(buf, "tmp=%ld ", opt.tmpdisk);
@@ -1353,7 +1350,7 @@ static void _usage(void)
 "              [--mail-type=type] [--mail-user=user][--nice[=value]]\n"
 "              [--bell] [--no-bell] [--kill-command[=signal]]\n"
 "              [--nodefile=file] [--nodelist=hosts] [--exclude=hosts]\n"
-"              [--network=type]\n"
+"              [--network=type] [--mem-per-cpu=MB]\n"
 "              executable [args...]\n");
 }
 
@@ -1416,8 +1413,8 @@ static void _help(void)
 "Consumable resources related options:\n" 
 "      --exclusive             allocate nodes in exclusive mode when\n" 
 "                              cpu consumable resource is enabled\n"
-"      --task-mem=MB           maximum amount of real memory per task\n"
-"                              required by the job.\n" 
+"      --mem-per-cpu=MB        maximum amount of real memory per allocated\n"
+"                              cpu required by the job.\n" 
 "                              --mem >= --job-mem if --mem is specified.\n" 
 "\n"
 "Affinity/Multi-core options: (when the task/affinity plugin is enabled)\n" 
diff --git a/src/salloc/opt.h b/src/salloc/opt.h
index 972444517d9..2ca869cc76a 100644
--- a/src/salloc/opt.h
+++ b/src/salloc/opt.h
@@ -107,7 +107,7 @@ typedef struct salloc_options {
 	int minsockets;		/* --minsockets=n		*/
 	int mincores;		/* --mincores=n			*/
 	int minthreads;		/* --minthreads=n		*/
-	int task_mem;		/* --task-mem=n			*/
+	int mem_per_cpu;	/* --mem-per-cpu=n		*/
 	int realmem;		/* --mem=n			*/
 	long tmpdisk;		/* --tmp=n			*/
 	char *constraints;	/* --constraints=, -C constraint*/
diff --git a/src/salloc/salloc.c b/src/salloc/salloc.c
index edb65cb94d7..79c5616b3c2 100644
--- a/src/salloc/salloc.c
+++ b/src/salloc/salloc.c
@@ -217,10 +217,6 @@ int main(int argc, char *argv[])
 		env_array_append_fmt(&env, "SLURM_ACCTG_FREQ", "%d",
 			opt.acctg_freq);
 	}
-	if (opt.task_mem >= 0) {
-		env_array_append_fmt(&env, "SLURM_TASK_MEM", "%d",
-			opt.task_mem);
-	}
 	if (opt.network)
 		env_array_append_fmt(&env, "SLURM_NETWORK", "%s", opt.network);
 	env_array_set_environment(env);
@@ -360,6 +356,8 @@ static int fill_job_desc_from_opts(job_desc_msg_t *desc)
 		desc->job_min_threads = opt.minthreads;
 	if (opt.realmem > -1)
 		desc->job_min_memory = opt.realmem;
+	else if (opt.mem_per_cpu > -1)
+		desc->job_min_memory = opt.mem_per_cpu | MEM_PER_CPU;
 	if (opt.tmpdisk > -1)
 		desc->job_min_tmp_disk = opt.tmpdisk;
 	if (opt.overcommit) {
diff --git a/src/sbatch/opt.c b/src/sbatch/opt.c
index 365782b68d9..263b70c204a 100644
--- a/src/sbatch/opt.c
+++ b/src/sbatch/opt.c
@@ -122,7 +122,7 @@
 #define LONG_OPT_NTASKSPERNODE   0x136
 #define LONG_OPT_NTASKSPERSOCKET 0x137
 #define LONG_OPT_NTASKSPERCORE   0x138
-#define LONG_OPT_TASK_MEM        0x13a
+#define LONG_OPT_MEM_PER_CPU     0x13a
 #define LONG_OPT_HINT            0x13b
 #define LONG_OPT_BLRTS_IMAGE     0x140
 #define LONG_OPT_LINUX_IMAGE     0x141
@@ -269,7 +269,7 @@ static void _opt_default()
 	opt.minsockets      = -1;
 	opt.mincores        = -1;
 	opt.minthreads      = -1;
-	opt.task_mem	    = -1;
+	opt.mem_per_cpu	    = -1;
 	opt.realmem	    = -1;
 	opt.tmpdisk	    = -1;
 
@@ -523,8 +523,9 @@ static struct option long_options[] = {
 	{"mincores",      required_argument, 0, LONG_OPT_MINCORES},
 	{"minthreads",    required_argument, 0, LONG_OPT_MINTHREADS},
 	{"mem",           required_argument, 0, LONG_OPT_MEM},
-	{"job-mem",       required_argument, 0, LONG_OPT_TASK_MEM},
-	{"task-mem",      required_argument, 0, LONG_OPT_TASK_MEM},
+	{"job-mem",       required_argument, 0, LONG_OPT_MEM_PER_CPU},
+	{"task-mem",      required_argument, 0, LONG_OPT_MEM_PER_CPU},
+	{"mem-per-cpu",   required_argument, 0, LONG_OPT_MEM_PER_CPU},
 	{"hint",          required_argument, 0, LONG_OPT_HINT},
 	{"tmp",           required_argument, 0, LONG_OPT_TMP},
 	{"jobid",         required_argument, 0, LONG_OPT_JOBID},
@@ -1150,14 +1151,13 @@ static void _set_options(int argc, char **argv)
 				exit(1);
 			}
 			break;
-		case LONG_OPT_TASK_MEM:
-			opt.task_mem = (int) str_to_bytes(optarg);
-			if (opt.task_mem < 0) {
+		case LONG_OPT_MEM_PER_CPU:
+			opt.mem_per_cpu = (int) str_to_bytes(optarg);
+			if (opt.mem_per_cpu < 0) {
 				error("invalid memory constraint %s", 
 				      optarg);
 				exit(1);
 			}
-			setenvf(NULL, "SLURM_TASK_MEM", "%d", opt.task_mem);
 			break;
 		case LONG_OPT_TMP:
 			opt.tmpdisk = str_to_bytes(optarg);
@@ -1773,15 +1773,11 @@ static bool _opt_verify(void)
 		verified = false;
 	}
 
-        /* When CR with memory as a CR is enabled we need to assign
-	 * adequate value or check the value to opt.mem */
-	if ((opt.realmem >= -1) && (opt.task_mem > 0)) {
-		if (opt.realmem == -1) {
-			opt.realmem = opt.task_mem;
-		} else if (opt.realmem < opt.task_mem) {
-			info("mem < task-mem - resizing mem to be equal "
-			     "to task-mem");
-			opt.realmem = opt.task_mem;
+	if ((opt.realmem > -1) && (opt.mem_per_cpu > -1)) {
+		if (opt.realmem < opt.mem_per_cpu) {
+			info("mem < mem-per-cpu - resizing mem to be equal "
+			     "to mem-per-cpu");
+			opt.realmem = opt.mem_per_cpu;
 		}
 	}
 	
@@ -1979,8 +1975,8 @@ static char *print_constraints()
 	if (opt.realmem > 0)
 		xstrfmtcat(buf, "mem=%dM ", opt.realmem);
 
-	if (opt.task_mem > 0)
-		xstrfmtcat(buf, "task-mem=%dM ", opt.task_mem);
+	if (opt.mem_per_cpu > 0)
+		xstrfmtcat(buf, "mem-per-cpu=%dM ", opt.mem_per_cpu);
 
 	if (opt.tmpdisk > 0)
 		xstrfmtcat(buf, "tmp=%ld ", opt.tmpdisk);
@@ -2154,7 +2150,7 @@ static void _usage(void)
 "              [--mail-type=type] [--mail-user=user][--nice[=value]]\n"
 "              [--requeue] [--no-requeue] [--ntasks-per-node=n] [--propagate]\n"
 "              [--nodefile=file] [--nodelist=hosts] [--exclude=hosts]\n"
-"              [--network=type]\n"
+"              [--network=type] [--mem-per-cpu=MB]\n"
 "              executable [args...]\n");
 }
 
@@ -2219,9 +2215,8 @@ static void _help(void)
 "Consumable resources related options:\n" 
 "      --exclusive             allocate nodes in exclusive mode when\n" 
 "                              cpu consumable resource is enabled\n"
-"      --task-mem=MB           maximum amount of real memory per task\n"
-"                              required by the job.\n" 
-"                              --mem >= --job-mem if --mem is specified.\n" 
+"      --mem-per-cpu=MB        maximum amount of real memory per CPU\n"
+"                              allocated to the job.\n" 
 "\n"
 "Affinity/Multi-core options: (when the task/affinity plugin is enabled)\n" 
 "  -B --extra-node-info=S[:C[:T]]            Expands to:\n"
diff --git a/src/sbatch/opt.h b/src/sbatch/opt.h
index e2403dac574..7d83a3c3cc7 100644
--- a/src/sbatch/opt.h
+++ b/src/sbatch/opt.h
@@ -113,7 +113,7 @@ typedef struct sbatch_options {
 	int minsockets;		/* --minsockets=n		*/
 	int mincores;		/* --mincores=n			*/
 	int minthreads;		/* --minthreads=n		*/
-	int task_mem;		/* --task-mem=n			*/
+	int mem_per_cpu;	/* --mem-per-cpu=n		*/
 	int realmem;		/* --mem=n			*/
 	long tmpdisk;		/* --tmp=n			*/
 	char *constraints;	/* --constraints=, -C constraint*/
diff --git a/src/sbatch/sbatch.c b/src/sbatch/sbatch.c
index 03d0377895b..92d42e91df7 100644
--- a/src/sbatch/sbatch.c
+++ b/src/sbatch/sbatch.c
@@ -227,6 +227,8 @@ static int fill_job_desc_from_opts(job_desc_msg_t *desc)
 		desc->job_min_threads = opt.minthreads;
 	if (opt.realmem > -1)
 		desc->job_min_memory = opt.realmem;
+	else if (opt.mem_per_cpu > -1)
+		desc->job_min_memory = opt.mem_per_cpu | MEM_PER_CPU;
 	if (opt.tmpdisk > -1)
 		desc->job_min_tmp_disk = opt.tmpdisk;
 	if (opt.overcommit) {
diff --git a/src/scontrol/update_job.c b/src/scontrol/update_job.c
index 49ae2df8ab6..b981d761dfd 100644
--- a/src/scontrol/update_job.c
+++ b/src/scontrol/update_job.c
@@ -324,12 +324,19 @@ scontrol_update_job (int argc, char *argv[])
 						(char **) NULL, 10);
 			update_cnt++;
 		}
-		else if (strncasecmp(argv[i], "MinMemory=", 10) == 0) {
+		else if (strncasecmp(argv[i], "MinMemoryNode=", 14) == 0) {
 			job_msg.job_min_memory = 
-				(uint32_t) strtol(&argv[i][10], 
+				(uint32_t) strtol(&argv[i][14], 
 						(char **) NULL, 10);
 			update_cnt++;
 		}
+		else if (strncasecmp(argv[i], "MinMemoryCPU=", 13) == 0) {
+			job_msg.job_min_memory =
+				(uint32_t) strtol(&argv[i][13],
+						(char **) NULL, 10);
+			job_msg.job_min_memory |= MEM_PER_CPU;
+			update_cnt++;
+		}
 		else if (strncasecmp(argv[i], "MinTmpDisk=", 11) == 0) {
 			job_msg.job_min_tmp_disk = 
 				(uint32_t) strtol(&argv[i][11], 
diff --git a/src/slurmctld/job_mgr.c b/src/slurmctld/job_mgr.c
index 12da6e651c4..fb07c2300c8 100644
--- a/src/slurmctld/job_mgr.c
+++ b/src/slurmctld/job_mgr.c
@@ -2,8 +2,6 @@
  *  job_mgr.c - manage the job information of slurm
  *	Note: there is a global job list (job_list), time stamp 
  *	(last_job_update), and hash table (job_hash)
- *
- *  $Id$
  *****************************************************************************
  *  Copyright (C) 2002-2007 The Regents of the University of California.
  *  Copyright (C) 2008 Lawrence Livermore National Security.
@@ -1228,7 +1226,7 @@ void dump_job_desc(job_desc_msg_t * job_specs)
 	long kill_on_node_fail, shared, immediate;
 	long cpus_per_task, requeue, num_tasks, overcommit;
 	long ntasks_per_node, ntasks_per_socket, ntasks_per_core;
-	char buf[100];
+	char *mem_type, buf[100];
 
 	if (job_specs == NULL)
 		return;
@@ -1262,12 +1260,21 @@ void dump_job_desc(job_desc_msg_t * job_specs)
 	debug3("   job_min_cores=%ld job_min_threads=%ld",
 	       job_min_cores, job_min_threads);
 
-	job_min_memory   = (job_specs->job_min_memory != NO_VAL) ? 
-		(long) job_specs->job_min_memory : -1L;
+	if (job_specs->job_min_memory == NO_VAL) {
+		job_min_memory = -1L;
+		mem_type = "job";
+	} else if (job_specs->job_min_memory & MEM_PER_CPU) {
+		job_min_memory = (long) (job_specs->job_min_memory &
+					 (~MEM_PER_CPU));
+		mem_type = "cpu";
+	} else {
+		job_min_memory = (long) job_specs->job_min_memory;
+		mem_type = "job";
+	}
 	job_min_tmp_disk = (job_specs->job_min_tmp_disk != NO_VAL) ? 
 		(long) job_specs->job_min_tmp_disk : -1L;
-	debug3("   job_min_memory=%ld job_min_tmp_disk=%ld",
-	       job_min_memory, job_min_tmp_disk);
+	debug3("   min_memory_%s=%ld job_min_tmp_disk=%ld",
+	       mem_type, job_min_memory, job_min_tmp_disk);
 	immediate = (job_specs->immediate == 0) ? 0L : 1L;
 	debug3("   immediate=%ld features=%s",
 	       immediate, job_specs->features);
@@ -2854,6 +2861,53 @@ static char *_copy_nodelist_no_dup(char *node_list)
 	return xstrdup(buf);
 }
 
+static bool _valid_job_min_mem(job_desc_msg_t * job_desc_msg)
+{
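+	/* Compare the job's memory request with the configured limit
+	 * (MaxMemPerCPU or MaxMemPerNode, stored in max_mem_per_task);
+	 * return true when the request is within the limit. */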
+	uint32_t base_size = job_desc_msg->job_min_memory;
+	uint32_t size_limit = slurmctld_conf.max_mem_per_task;
+	uint16_t cpus_per_node;
+
+	if (size_limit == 0)
+		return true;
+
+	if ((base_size  & MEM_PER_CPU) && (size_limit & MEM_PER_CPU)) {
+		base_size  &= (~MEM_PER_CPU);
+		size_limit &= (~MEM_PER_CPU);
+		if (base_size <= size_limit)
+			return true;
+		return false;
+	}
+
+	if (((base_size  & MEM_PER_CPU) == 0) &&
+	    ((size_limit & MEM_PER_CPU) == 0)) {
+		if (base_size <= size_limit)
+			return true;
+		return false;
+	}
+
+	/* Our size is per CPU and the limit is per node, or vice versa.
+	 * CPU counts may vary by node, but we don't have a good
+	 * way to identify specific nodes for the job at this
+	 * point, so just pick the first node as a basis for
+	 * enforcing MaxMemPerCPU or MaxMemPerNode. */
+	if (slurmctld_conf.fast_schedule)
+		cpus_per_node = node_record_table_ptr[0].config_ptr->cpus;
+	else
+		cpus_per_node = node_record_table_ptr[0].cpus;
+	if (job_desc_msg->num_procs != NO_VAL)
+		cpus_per_node = MIN(cpus_per_node, job_desc_msg->num_procs);
+	if (base_size & MEM_PER_CPU) {
+		base_size &= (~MEM_PER_CPU);
+		base_size *= cpus_per_node;
+	} else {
+		size_limit &= (~MEM_PER_CPU);
+		size_limit *= cpus_per_node;
+	}
+	if (base_size <= size_limit)
+		return true;
+	return false;
+}
+
 /* 
  * job_time_limit - terminate jobs which have exceeded their time limit
  * global: job_list - pointer global job list
@@ -3010,6 +3064,12 @@ static int _validate_job_desc(job_desc_msg_t * job_desc_msg, int allocate,
 			job_desc_msg->nice = NICE_OFFSET;
 	}
 
+	if (job_desc_msg->job_min_memory == NO_VAL) {
+		/* Default limit is DefMemPerCPU or DefMemPerNode (if set), else none */
+		job_desc_msg->job_min_memory = slurmctld_conf.def_mem_per_task;
+	} else if (!_valid_job_min_mem(job_desc_msg))
+		return ESLURM_INVALID_TASK_MEMORY;
+
 	if (job_desc_msg->min_sockets == (uint16_t) NO_VAL)
 		job_desc_msg->min_sockets = 1;	/* default socket count of 1 */
 	if (job_desc_msg->min_cores == (uint16_t) NO_VAL)
@@ -3035,8 +3095,6 @@ static int _validate_job_desc(job_desc_msg_t * job_desc_msg, int allocate,
 		job_desc_msg->job_min_cores = 1;   /* default 1 core per socket */
 	if (job_desc_msg->job_min_threads == (uint16_t) NO_VAL)
 		job_desc_msg->job_min_threads = 1; /* default 1 thread per core */
-	if (job_desc_msg->job_min_memory == NO_VAL)
-		job_desc_msg->job_min_memory = 0;  /* default no memory limit */
 	if (job_desc_msg->job_min_tmp_disk == NO_VAL)
 		job_desc_msg->job_min_tmp_disk = 0;/* default 0MB disk per node */
 
@@ -3900,12 +3958,16 @@ int update_job(job_desc_msg_t * job_specs, uid_t uid)
 	if (job_specs->job_min_memory != NO_VAL) {
 		if ((!IS_JOB_PENDING(job_ptr)) || (detail_ptr == NULL))
 			error_code = ESLURM_DISABLED;
-		else if (super_user
-			 || (detail_ptr->job_min_memory
-			     > job_specs->job_min_memory)) {
+		else if (super_user) {
+			char *entity;
+			if (job_specs->job_min_memory & MEM_PER_CPU)
+				entity = "cpu";
+			else
+				entity = "job";
 			detail_ptr->job_min_memory = job_specs->job_min_memory;
-			info("update_job: setting job_min_memory to %u for "
-			     "job_id %u", job_specs->job_min_memory, 
+			info("update_job: setting min_memory_%s to %u for "
+			     "job_id %u", entity, 
+			     (job_specs->job_min_memory & (~MEM_PER_CPU)), 
 			     job_specs->job_id);
 		} else {
 			error("Attempt to increase job_min_memory for job %u",
diff --git a/src/slurmctld/node_scheduler.c b/src/slurmctld/node_scheduler.c
index 0bc50f8e961..c8870005075 100644
--- a/src/slurmctld/node_scheduler.c
+++ b/src/slurmctld/node_scheduler.c
@@ -1196,7 +1196,8 @@ extern int job_req_node_filter(struct job_record *job_ptr,
 		FREE_NULL_BITMAP(feature_bitmap);
 		if (slurmctld_conf.fast_schedule) {
 			if ((detail_ptr->job_min_procs    > config_ptr->cpus       )
-			||  (detail_ptr->job_min_memory   > config_ptr->real_memory) 
+			||  ((detail_ptr->job_min_memory & (~MEM_PER_CPU)) > 
+			      config_ptr->real_memory) 
 			||  (detail_ptr->job_min_tmp_disk > config_ptr->tmp_disk)) {
 				bit_clear(avail_bitmap, i);
 				continue;
@@ -1213,7 +1214,8 @@ extern int job_req_node_filter(struct job_record *job_ptr,
 			}
 		} else {
 			if ((detail_ptr->job_min_procs    > node_ptr->cpus       )
-			||  (detail_ptr->job_min_memory   > node_ptr->real_memory) 
+			||  ((detail_ptr->job_min_memory & (~MEM_PER_CPU)) >
+			      node_ptr->real_memory) 
 			||  (detail_ptr->job_min_tmp_disk > node_ptr->tmp_disk)) {
 				bit_clear(avail_bitmap, i);
 				continue;
@@ -1284,7 +1286,8 @@ static int _build_node_list(struct job_record *job_ptr,
 
 		config_filter = 0;
 		if ((detail_ptr->job_min_procs    > config_ptr->cpus       )
-		||  (detail_ptr->job_min_memory   > config_ptr->real_memory) 
+		||  ((detail_ptr->job_min_memory & (~MEM_PER_CPU)) > 
+		      config_ptr->real_memory) 
 		||  (detail_ptr->job_min_tmp_disk > config_ptr->tmp_disk))
 			config_filter = 1;
 		if (mc_ptr
@@ -1391,7 +1394,8 @@ static void _filter_nodes_in_set(struct node_set *node_set_ptr,
 
 			node_con = node_record_table_ptr[i].config_ptr;
 			if ((job_con->job_min_procs    <= node_con->cpus)
-			&&  (job_con->job_min_memory   <= node_con->real_memory)
+			&&  ((job_con->job_min_memory & (~MEM_PER_CPU)) <= 
+			      node_con->real_memory)
 			&&  (job_con->job_min_tmp_disk <= node_con->tmp_disk))
 				job_ok = 1;
 			if (mc_ptr
@@ -1419,7 +1423,8 @@ static void _filter_nodes_in_set(struct node_set *node_set_ptr,
 
 			node_ptr = &node_record_table_ptr[i];
 			if ((job_con->job_min_procs    <= node_ptr->cpus)
-			&&  (job_con->job_min_memory   <= node_ptr->real_memory)
+			&&  ((job_con->job_min_memory & (~MEM_PER_CPU)) <= 
+			      node_ptr->real_memory)
 			&&  (job_con->job_min_tmp_disk <= node_ptr->tmp_disk))
 				job_ok = 1;
 			if (mc_ptr
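The node_scheduler.c filters above mask off MEM_PER_CPU before comparing against
a node's real_memory. When the limit is per-CPU, the masked value is only the
per-CPU requirement, so the test acts as a conservative pre-filter: it rejects
nodes that cannot satisfy even one CPU's worth of memory, and the full per-node
amount is enforced later when CPUs are actually allocated. A minimal sketch of
the predicate (illustration only; the MEM_PER_CPU value is assumed):

#include <stdint.h>

#define MEM_PER_CPU 0x80000000	/* assumed flag value; see slurm.h.in */

/* Conservative memory pre-filter used during node selection.
 * real_memory is the node's usable memory in MB. */
static int node_mem_ok(uint32_t job_min_memory, uint32_t real_memory)
{
	/* per-CPU request: masked value is MB per CPU, a lower bound on
	 * what the node must provide; per-node request: masked value is
	 * the full requirement. */
	return (job_min_memory & ~MEM_PER_CPU) <= real_memory;
}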
diff --git a/src/slurmctld/slurmctld.h b/src/slurmctld/slurmctld.h
index 2fc834f8d82..d156b1b8a94 100644
--- a/src/slurmctld/slurmctld.h
+++ b/src/slurmctld/slurmctld.h
@@ -331,7 +331,8 @@ struct job_details {
 	uint16_t ntasks_per_node;	/* number of tasks on each node */
 	/* job constraints: */
 	uint32_t job_min_procs;		/* minimum processors per node */
-	uint32_t job_min_memory;	/* minimum memory per node, MB */
+	uint32_t job_min_memory;	/* minimum memory per node (MB) OR
+					 * memory per allocated CPU (MB), ORed with MEM_PER_CPU */
 	uint32_t job_min_tmp_disk;	/* minimum tempdisk per node, MB */
 	char *err;			/* pathname of job's stderr file */
 	char *in;			/* pathname of job's stdin file */
diff --git a/src/slurmd/slurmd/req.c b/src/slurmd/slurmd/req.c
index dab7c08eebe..a7cdb943379 100644
--- a/src/slurmd/slurmd/req.c
+++ b/src/slurmd/slurmd/req.c
@@ -616,7 +616,8 @@ _check_job_credential(launch_tasks_request_msg_t *req, uid_t uid,
 {
 	slurm_cred_arg_t arg;
 	hostset_t        hset    = NULL;
-	bool             user_ok = _slurm_authorized_user(uid); 
+	bool             user_ok = _slurm_authorized_user(uid);
+	bool             verified = true;
 	int              host_index = -1;
 	int              rc;
 	slurm_cred_t     cred = req->cred;
@@ -628,24 +629,18 @@ _check_job_credential(launch_tasks_request_msg_t *req, uid_t uid,
 	 * credentials are checked
 	 */
 	if ((rc = slurm_cred_verify(conf->vctx, cred, &arg)) < 0) {
-		if (!user_ok) {
+		verified = false;
+		if (!user_ok)
 			return SLURM_ERROR;
-		} else {
+		else {
 			debug("_check_job_credential slurm_cred_verify failed:"
 			      " %m, but continuing anyway.");
 		}
 	}
 
-	/* Overwrite any memory limits in the RPC with 
-	 * contents of the credential */
-	req->job_mem  = arg.job_mem;
-	req->task_mem = arg.task_mem;
-
-	/*
-	 * If uid is the slurm user id or root, do not bother
-	 * performing validity check of the credential
-	 */
-	if (user_ok) {
+	/* If uid is the SlurmUser or root and the credential is bad,
+	 * then do not attempt to validate the credential */
+	if (!verified) {
 		*step_hset = NULL;
 		if (rc >= 0) {
 			if ((hset = hostset_create(arg.hostlist)))
@@ -684,12 +679,11 @@ _check_job_credential(launch_tasks_request_msg_t *req, uid_t uid,
 	}
 
         if ((arg.alloc_lps_cnt > 0) && (tasks_to_launch > 0)) {
-
                 host_index = hostset_find(hset, conf->node_name);
 
                 /* Left in here for debugging purposes */
 #if(0)
-                if(host_index >= 0)
+                if (host_index >= 0)
                   info(" cons_res %u alloc_lps_cnt %u "
 			"task[%d] = %u = task_to_launch %d host %s ", 
 			arg.jobid, arg.alloc_lps_cnt, host_index, 
@@ -714,6 +708,20 @@ _check_job_credential(launch_tasks_request_msg_t *req, uid_t uid,
 		}
         }
 
+	/* Overwrite any memory limits in the RPC with 
+	 * contents of the credential */
+	if (arg.job_mem & MEM_PER_CPU) {
+		req->job_mem = arg.job_mem & (~MEM_PER_CPU);
+		if (host_index >= 0)
+			req->job_mem *= arg.alloc_lps[host_index];
+	} else
+		req->job_mem = arg.job_mem;
+	req->task_mem = arg.task_mem;	/* Defunct */
+#if 0
+	info("mem orig:%u cpus:%u limit:%u", 
+	     arg.job_mem, arg.alloc_lps[host_index], req->job_mem);
+#endif
+
 	*step_hset = hset;
 	xfree(arg.hostlist);
 	arg.alloc_lps_cnt = 0;
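The credential handling above is where slurmd turns a per-CPU limit into the
per-node limit it enforces: the flag bit is stripped and the value scaled by the
CPUs allocated to the job on this node. A sketch with hypothetical numbers
(illustration only; the MEM_PER_CPU value is assumed, and the guard on
host_index from the patch is omitted for brevity):

#include <stdint.h>

#define MEM_PER_CPU 0x80000000	/* assumed flag value; see slurm.h.in */

int main(void)
{
	uint32_t job_mem    = 512 | MEM_PER_CPU;	/* e.g. --mem-per-cpu=512 */
	uint32_t alloc_cpus = 4;			/* CPUs allocated on this node */
	uint32_t node_limit;

	if (job_mem & MEM_PER_CPU)
		node_limit = (job_mem & ~MEM_PER_CPU) * alloc_cpus;	/* 2048 MB */
	else
		node_limit = job_mem;		/* already a per-node figure */
	return (int) node_limit;
}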
diff --git a/src/squeue/print.c b/src/squeue/print.c
index 0985f5ed2d1..74b01063683 100644
--- a/src/squeue/print.c
+++ b/src/squeue/print.c
@@ -892,6 +892,7 @@ int _print_job_min_memory(job_info_t * job, int width, bool right_justify,
 		_print_str("MIN_MEMORY", width, right_justify, true);
 	else {
 	    	tmp_char[0] = '\0';
+		job->job_min_memory &= (~MEM_PER_CPU);
 		convert_num_unit((float)job->job_min_memory, min_mem, 
 				 sizeof(min_mem), UNIT_NONE);
 		strcat(tmp_char, min_mem);
diff --git a/src/squeue/sort.c b/src/squeue/sort.c
index 05c42f4f7e3..2a0bce6371b 100644
--- a/src/squeue/sort.c
+++ b/src/squeue/sort.c
@@ -455,6 +455,8 @@ static int _sort_job_by_min_memory(void *void1, void *void2)
 	job_info_t *job1 = (job_info_t *) void1;
 	job_info_t *job2 = (job_info_t *) void2;
 
+	job1->job_min_memory &= (~MEM_PER_CPU);
+	job2->job_min_memory &= (~MEM_PER_CPU);
 	diff = job1->job_min_memory - job2->job_min_memory;
 
 	if (reverse_order)
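Note that _sort_job_by_min_memory() masks MEM_PER_CPU in the job records
themselves before subtracting, so per-CPU and per-node requests are ordered by
their raw MB values. A non-mutating equivalent (a sketch, not the patch's code;
the MEM_PER_CPU value is assumed) would mask local copies instead:

#include <stdint.h>

#define MEM_PER_CPU 0x80000000	/* assumed flag value; see slurm.h.in */

/* Compare two memory requests without modifying the records; the
 * arguments stand in for the two jobs' job_min_memory fields. */
static int cmp_min_memory(uint32_t mem1, uint32_t mem2)
{
	mem1 &= ~MEM_PER_CPU;
	mem2 &= ~MEM_PER_CPU;
	return (mem1 > mem2) - (mem1 < mem2);	/* avoids unsigned wrap */
}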
diff --git a/src/srun/allocate.c b/src/srun/allocate.c
index 8f7cb8715c0..c06508cd172 100644
--- a/src/srun/allocate.c
+++ b/src/srun/allocate.c
@@ -450,7 +450,9 @@ job_desc_msg_create_from_opts ()
 	if (opt.job_min_threads != NO_VAL)
 		j->job_min_threads  = opt.job_min_threads;
 	if (opt.job_min_memory != NO_VAL)
-		j->job_min_memory   = opt.job_min_memory;
+		j->job_min_memory = opt.job_min_memory;
+	else if (opt.mem_per_cpu != NO_VAL)
+		j->job_min_memory = opt.mem_per_cpu | MEM_PER_CPU;
 	if (opt.job_min_tmp_disk != NO_VAL)
 		j->job_min_tmp_disk = opt.job_min_tmp_disk;
 	if (opt.overcommit) {
@@ -511,8 +513,6 @@ create_job_step(srun_job_t *job)
 		: (opt.nprocs*opt.cpus_per_task);
 	
 	job->ctx_params.relative = (uint16_t)opt.relative;
-	if (opt.task_mem != NO_VAL)
-		job->ctx_params.mem_per_task = (uint16_t)opt.task_mem;
 	job->ctx_params.ckpt_interval = (uint16_t)opt.ckpt_interval;
 	job->ctx_params.ckpt_path = opt.ckpt_path;
 	job->ctx_params.exclusive = (uint16_t)opt.exclusive;
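In allocate.c the two user-facing options collapse into the single
job_min_memory field sent to slurmctld; when neither option is given, the
controller later substitutes DefMemPerCPU (see the job_mgr.c hunk above). A
compact restatement as a sketch; the MEM_PER_CPU and NO_VAL values and the
helper name are assumptions, not part of the patch:

#include <stdint.h>

#define MEM_PER_CPU 0x80000000	/* assumed flag value; see slurm.h.in */
#define NO_VAL      0xfffffffe	/* assumed sentinel; see slurm.h.in */

/* Collapse --mem (MB per node) and --mem-per-cpu (MB per CPU) into the
 * single job_min_memory value carried in the job descriptor. */
static uint32_t pick_min_memory(uint32_t opt_mem, uint32_t opt_mem_per_cpu)
{
	if (opt_mem != NO_VAL)
		return opt_mem;				/* e.g. --mem=2048 */
	if (opt_mem_per_cpu != NO_VAL)
		return opt_mem_per_cpu | MEM_PER_CPU;	/* e.g. --mem-per-cpu=512 */
	return NO_VAL;	/* controller applies DefMemPerCPU, if configured */
}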
diff --git a/src/srun/opt.c b/src/srun/opt.c
index 97d161f93a4..94bfff45fc2 100644
--- a/src/srun/opt.c
+++ b/src/srun/opt.c
@@ -1,8 +1,8 @@
 /*****************************************************************************\
  *  opt.c - options processing for srun
- *  $Id$
  *****************************************************************************
- *  Copyright (C) 2002-2006 The Regents of the University of California.
+ *  Copyright (C) 2002-2007 The Regents of the University of California.
+ *  Copyright (C) 2008 Lawrence Livermore National Security.
  *  Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
  *  Written by Mark Grondona <grondona1@llnl.gov>, et. al.
  *  LLNL-CODE-402394.
@@ -153,7 +153,7 @@
 #define LONG_OPT_NTASKSPERNODE	 0x136
 #define LONG_OPT_NTASKSPERSOCKET 0x137
 #define LONG_OPT_NTASKSPERCORE	 0x138
-#define LONG_OPT_TASK_MEM        0x13a
+#define LONG_OPT_MEM_PER_CPU     0x13a
 #define LONG_OPT_HINT	         0x13b
 #define LONG_OPT_BLRTS_IMAGE     0x140
 #define LONG_OPT_LINUX_IMAGE     0x141
@@ -656,7 +656,7 @@ static void _opt_default()
 	opt.job_min_cores   = NO_VAL;
 	opt.job_min_threads = NO_VAL;
 	opt.job_min_memory  = NO_VAL;
-	opt.task_mem        = NO_VAL;
+	opt.mem_per_cpu     = NO_VAL;
 	opt.job_min_tmp_disk= NO_VAL;
 
 	opt.hold	    = false;
@@ -777,7 +777,6 @@ env_vars_t env_vars[] = {
 {"SLURM_EXCLUSIVE",     OPT_EXCLUSIVE,  NULL,               NULL             },
 {"SLURM_OPEN_MODE",     OPT_OPEN_MODE,  NULL,               NULL             },
 {"SLURM_ACCTG_FREQ",    OPT_INT,        &opt.acctg_freq,    NULL             },
-{"SLURM_TASK_MEM",      OPT_INT,        &opt.task_mem,      NULL             },
 {"SLURM_NETWORK",       OPT_STRING,     &opt.network,       NULL             },
 {NULL, 0, NULL, NULL}
 };
@@ -991,8 +990,9 @@ static void set_options(const int argc, char **argv)
 		{"mincores",         required_argument, 0, LONG_OPT_MINCORES},
 		{"minthreads",       required_argument, 0, LONG_OPT_MINTHREADS},
 		{"mem",              required_argument, 0, LONG_OPT_MEM},
-		{"job-mem",          required_argument, 0, LONG_OPT_TASK_MEM},
-		{"task-mem",         required_argument, 0, LONG_OPT_TASK_MEM},
+		{"job-mem",          required_argument, 0, LONG_OPT_MEM_PER_CPU},
+		{"task-mem",         required_argument, 0, LONG_OPT_MEM_PER_CPU},
+		{"mem-per-cpu",      required_argument, 0, LONG_OPT_MEM_PER_CPU},
 		{"hint",             required_argument, 0, LONG_OPT_HINT},
 		{"mpi",              required_argument, 0, LONG_OPT_MPI},
 		{"tmp",              required_argument, 0, LONG_OPT_TMP},
@@ -1314,9 +1314,9 @@ static void set_options(const int argc, char **argv)
 				exit(1);
 			}
 			break;
-		case LONG_OPT_TASK_MEM:
-			opt.task_mem = (int) str_to_bytes(optarg);
-			if (opt.task_mem < 0) {
+		case LONG_OPT_MEM_PER_CPU:
+			opt.mem_per_cpu = (int) str_to_bytes(optarg);
+			if (opt.mem_per_cpu < 0) {
 				error("invalid memory constraint %s", 
 				      optarg);
 				exit(1);
@@ -1626,15 +1626,11 @@ static void _opt_args(int argc, char **argv)
 
 	set_options(argc, argv);
 
-        /* When CR with memory as a CR is enabled we need to assign
-	 * adequate value or check the value to opt.mem */
-	if ((opt.job_min_memory >= -1) && (opt.task_mem > 0)) {
-		if (opt.job_min_memory == -1) {
-			opt.job_min_memory = opt.task_mem;
-		} else if (opt.job_min_memory < opt.task_mem) {
-			info("mem < task-mem - resizing mem to be equal "
-			     "to task-mem");
-			opt.job_min_memory = opt.task_mem;
+	if ((opt.job_min_memory > -1) && (opt.mem_per_cpu > -1)) {
+		if (opt.job_min_memory < opt.mem_per_cpu) {
+			info("mem < mem-per-cpu - resizing mem to be equal "
+			     "to mem-per-cpu");
+			opt.job_min_memory = opt.mem_per_cpu;
 		}
 	}
 
@@ -2030,19 +2026,6 @@ static bool _opt_verify(void)
 		xfree(sched_name);
 	}
 
-	if (opt.task_mem > 0) {
-		uint32_t max_mem = slurm_get_max_mem_per_task();
-		if (max_mem && (opt.task_mem > max_mem)) {
-			info("WARNING: Reducing --task-mem to system maximum "
-			     "of %u MB", max_mem);
-			opt.task_mem = max_mem;
-		}	
-	} else {
-		uint32_t max_mem = slurm_get_def_mem_per_task();
-		if (max_mem)
-			opt.task_mem = max_mem;
-	}
-
 	return verified;
 }
 
@@ -2069,8 +2052,8 @@ static char *print_constraints()
 	if (opt.job_min_memory > 0)
 		xstrfmtcat(buf, "mem=%dM ", opt.job_min_memory);
 
-	if (opt.task_mem > 0)
-		xstrfmtcat(buf, "task-mem=%dM ", opt.task_mem);
+	if (opt.mem_per_cpu > 0)
+		xstrfmtcat(buf, "mem-per-cpu=%dM ", opt.mem_per_cpu);
 
 	if (opt.job_min_tmp_disk > 0)
 		xstrfmtcat(buf, "tmp=%ld ", opt.job_min_tmp_disk);
@@ -2223,7 +2206,7 @@ static void _usage(void)
 "            [--kill-on-bad-exit] [--propagate[=rlimits] [--comment=name]\n"
 "            [--cpu_bind=...] [--mem_bind=...] [--network=type]\n"
 "            [--ntasks-per-node=n] [--ntasks-per-socket=n]\n"
-"            [--ntasks-per-core=n]\n"
+"            [--ntasks-per-core=n] [--mem-per-cpu=MB]\n"
 #ifdef HAVE_BG		/* Blue gene specific options */
 "            [--geometry=XxYxZ] [--conn-type=type] [--no-rotate] [--reboot]\n"
 "            [--blrts-image=path] [--linux-image=path]\n"
@@ -2321,8 +2304,8 @@ static void _help(void)
 "      --exclusive             allocate nodes in exclusive mode when\n" 
 "                              cpu consumable resource is enabled\n"
 "                              or don't share CPUs for job steps\n"
-"      --task-mem=MB           maximum amount of real memory per task\n"
-"                              required by the job.\n" 
+"      --mem-per-cpu=MB        maximum amount of real memory per allocated\n"
+"                              CPU required by the job.\n" 
 "                              --mem >= --job-mem if --mem is specified.\n" 
 "\n"
 "Affinity/Multi-core options: (when the task/affinity plugin is enabled)\n" 
diff --git a/src/srun/opt.h b/src/srun/opt.h
index f4668e37e2e..394051e9e76 100644
--- a/src/srun/opt.h
+++ b/src/srun/opt.h
@@ -170,7 +170,7 @@ typedef struct srun_options {
 	int32_t job_min_cores;	/* --mincores=n			*/
 	int32_t job_min_threads;/* --minthreads=n		*/
 	int32_t job_min_memory;	/* --mem=n			*/
-	int32_t task_mem;	/* --task-mem=n			*/
+	int32_t mem_per_cpu;	/* --mem-per-cpu=n		*/
 	long job_min_tmp_disk;	/* --tmp=n			*/
 	char *constraints;	/* --constraints=, -C constraint*/
 	bool contiguous;	/* --contiguous			*/
diff --git a/testsuite/expect/test1.23 b/testsuite/expect/test1.23
index ab47df6699f..2eb71e58cdb 100755
--- a/testsuite/expect/test1.23
+++ b/testsuite/expect/test1.23
@@ -108,7 +108,7 @@ set host_0      ""
 set timeout $max_job_delay
 set srun_pid [spawn $srun -N1 -l --mem=999999 -t1 $bin_hostname]
 expect {
-	-re "configuration is not available" {
+	-re "not available" {
 		send_user "This error is expected, no worries\n"
 		set err_msg 1
 		exp_continue
diff --git a/testsuite/expect/test15.7 b/testsuite/expect/test15.7
index ee9a6e126d0..f928c5ba58f 100755
--- a/testsuite/expect/test15.7
+++ b/testsuite/expect/test15.7
@@ -101,7 +101,7 @@ expect {
 		}
 		exp_continue
 	}
-	-re "MinMemory=($number)" {
+	-re "MinMemoryNode=($number)" {
 		set read_mem $expect_out(1,string)
 		if {$read_mem == $mem_size} {
 			incr matches
diff --git a/testsuite/expect/test17.10 b/testsuite/expect/test17.10
index e50a538e22a..d2b11f3a26f 100755
--- a/testsuite/expect/test17.10
+++ b/testsuite/expect/test17.10
@@ -104,7 +104,7 @@ expect {
 		}
 		exp_continue
 	}
-	-re "MinMemory=($number)" {
+	-re "MinMemoryNode=($number)" {
 		set read_mem $expect_out(1,string)
 		if {$read_mem == $mem_size} {
 			incr matches
-- 
GitLab