diff --git a/META b/META
index f8f6e5a5b79829fa49de864468f810214d21cb7e..fbe6e9222c1887af3f868beb75610ec6e52a8759 100644
--- a/META
+++ b/META
@@ -10,4 +10,4 @@
  Micro: 0
  Version: 0.3.0
  Release: 1
- API: 0x010000
+ API: 0x010100
diff --git a/NEWS b/NEWS
index d5c9f17b71675fac5ea039b36fb2fcb87954b758..a6f8de32c51d33ae9472a0041281a6890645ad41 100644
--- a/NEWS
+++ b/NEWS
@@ -1,8 +1,9 @@
 This file describes changes in recent versions of SLURM. It primarily
 documents those changes that are of interest to users and admins.
 
-* Changes in SLURM 0.3.1
+* Changes in SLURM 0.3.1 (not tagged yet)
 ========================
+ -- Set SLURM_TASKS_PER_NODE env var for batch jobs (and LAM/MPI).
  -- Fix for slurmd spinning when stdin buffers full (gnats:434)
 
 * Changes in SLURM 0.3.0
diff --git a/src/common/slurm_protocol_defs.c b/src/common/slurm_protocol_defs.c
index 64d0b76a3fc60e232dfdf58cadc4df4bdac93233..1579bf6535c79a25ff4d7a92391d7f0dfee608c2 100644
--- a/src/common/slurm_protocol_defs.c
+++ b/src/common/slurm_protocol_defs.c
@@ -145,6 +145,8 @@ void slurm_free_job_launch_msg(batch_job_launch_msg_t * msg)
 {
 	if (msg) {
 		xfree(msg->nodes);
+		xfree(msg->cpus_per_node);
+		xfree(msg->cpu_count_reps);
 		xfree(msg->script);
 		xfree(msg->err);
 		xfree(msg->in);
diff --git a/src/common/slurm_protocol_defs.h b/src/common/slurm_protocol_defs.h
index 2febadd293442dc64b72ec01a005f81ca8c0f344..47c7f3add66c29d10363d0f7e1c35422c7807d8a 100644
--- a/src/common/slurm_protocol_defs.h
+++ b/src/common/slurm_protocol_defs.h
@@ -325,6 +325,9 @@ typedef struct batch_job_launch_msg {
 	uint32_t job_id;
 	uint32_t uid;
 	uint32_t nprocs;	/* number of tasks in this job */
+	uint16_t num_cpu_groups;/* elements in below cpu arrays */
+	uint32_t *cpus_per_node;/* cpus per node */
+	uint32_t *cpu_count_reps;/* how many nodes have same cpu count */
 	char *nodes;		/* list of nodes allocated to job_step */
 	char *script;		/* the actual job script, default NONE */
 	char *err;		/* pathname of stderr */
diff --git a/src/common/slurm_protocol_pack.c b/src/common/slurm_protocol_pack.c
index 8d29d678c52d9073f4664d0d3f282f8d3bcfa4de..94c9c49b3c53814ab073bb5fa945f3ef5892b123 100644
--- a/src/common/slurm_protocol_pack.c
+++ b/src/common/slurm_protocol_pack.c
@@ -3,7 +3,7 @@
  *****************************************************************************
  * Copyright (C) 2002 The Regents of the University of California.
  * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
- * Written by Kevin Tew <tew1@llnl.gov>, Moe Jette <jette1@llnl.gov>, et. al.
+ * Written by Kevin Tew <tew1@llnl.gov>, et. al.
  * UCRL-CODE-2002-040.
  *
  * This file is part of SLURM, a resource management program.
@@ -2486,6 +2486,10 @@ _pack_batch_job_launch_msg(batch_job_launch_msg_t * msg, Buf buffer)
 
 	pack32(msg->job_id, buffer);
 	pack32(msg->uid, buffer);
+	pack16(msg->num_cpu_groups, buffer);
+	pack32_array(msg->cpus_per_node, msg->num_cpu_groups, buffer);
+	pack32_array(msg->cpu_count_reps, msg->num_cpu_groups, buffer);
+
 	packstr(msg->nodes, buffer);
 	packstr(msg->script, buffer);
 	packstr(msg->work_dir, buffer);
@@ -2505,6 +2509,7 @@ static int
 _unpack_batch_job_launch_msg(batch_job_launch_msg_t ** msg, Buf buffer)
 {
 	uint16_t uint16_tmp;
+	uint32_t uint32_tmp;
 	batch_job_launch_msg_t *launch_msg_ptr;
 
 	assert(msg != NULL);
@@ -2514,6 +2519,23 @@ _unpack_batch_job_launch_msg(batch_job_launch_msg_t ** msg, Buf buffer)
 
 	safe_unpack32(&launch_msg_ptr->job_id, buffer);
 	safe_unpack32(&launch_msg_ptr->uid, buffer);
+	safe_unpack16(&launch_msg_ptr->num_cpu_groups, buffer);
+	if (launch_msg_ptr->num_cpu_groups > 0) {
+		safe_unpack32_array((uint32_t **) &
+			(launch_msg_ptr->cpus_per_node), &uint32_tmp,
+			buffer);
+		if (launch_msg_ptr->num_cpu_groups != uint32_tmp)
+			goto unpack_error;
+		safe_unpack32_array((uint32_t **) &
+			(launch_msg_ptr->cpu_count_reps), &uint32_tmp,
+			buffer);
+		if (launch_msg_ptr->num_cpu_groups != uint32_tmp)
+			goto unpack_error;
+	} else {
+		launch_msg_ptr->cpus_per_node = NULL;
+		launch_msg_ptr->cpu_count_reps = NULL;
+	}
+
 	safe_unpackstr_xmalloc(&launch_msg_ptr->nodes, &uint16_tmp, buffer);
 	safe_unpackstr_xmalloc(&launch_msg_ptr->script, &uint16_tmp, buffer);
 	safe_unpackstr_xmalloc(&launch_msg_ptr->work_dir, &uint16_tmp,
diff --git a/src/slurmctld/controller.c b/src/slurmctld/controller.c
index 3aa75a0b373a3630e24a0632768fcb7d48ed9171..9a47443f49723ee7bca3f536c9c9aedf73fa42e7 100644
--- a/src/slurmctld/controller.c
+++ b/src/slurmctld/controller.c
@@ -3,7 +3,7 @@
  *****************************************************************************
  * Copyright (C) 2002 The Regents of the University of California.
  * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
- * Written by Moe Jette <jette@llnl.gov>, Kevin Tew <tew1@llnl.gov>, et. al.
+ * Written by Morris Jette <jette@llnl.gov>, Kevin Tew <tew1@llnl.gov>, et. al.
  * UCRL-CODE-2002-040.
  *
  * This file is part of SLURM, a resource management program.
@@ -338,7 +338,7 @@ static void _init_config(void)
 		 * Set the (per thread) stack size to a
 		 * more "reasonable" value to avoid running
 		 * out of virtual memory and dying */
-		rlim.rlim_cur = 1024 * 1024;
+		rlim.rlim_cur = rlim.rlim_max;
 		(void) setrlimit(RLIMIT_STACK, &rlim);
 	}
diff --git a/src/slurmctld/job_scheduler.c b/src/slurmctld/job_scheduler.c
index 599115011f6e51eeb906f3801ea28bd2d7f09bbc..9793abfd3fd20b9385b942d50ad577fa4a143e83 100644
--- a/src/slurmctld/job_scheduler.c
+++ b/src/slurmctld/job_scheduler.c
@@ -4,7 +4,7 @@
  *****************************************************************************
  * Copyright (C) 2002 The Regents of the University of California.
  * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
- * Written by Moe Jette <jette1@llnl.gov>
+ * Written by Morris Jette <jette1@llnl.gov>
  * UCRL-CODE-2002-040.
  *
  * This file is part of SLURM, a resource management program.
@@ -269,6 +269,16 @@ static void _launch_job(struct job_record *job_ptr)
 	launch_msg_ptr->environment = get_job_env(job_ptr,
 					&launch_msg_ptr->envc);
 
+	launch_msg_ptr->num_cpu_groups = job_ptr->num_cpu_groups;
+	launch_msg_ptr->cpus_per_node = xmalloc(sizeof(uint32_t) *
+			job_ptr->num_cpu_groups);
+	memcpy(launch_msg_ptr->cpus_per_node, job_ptr->cpus_per_node,
+			(sizeof(uint32_t) * job_ptr->num_cpu_groups));
+	launch_msg_ptr->cpu_count_reps = xmalloc(sizeof(uint32_t) *
+			job_ptr->num_cpu_groups);
+	memcpy(launch_msg_ptr->cpu_count_reps, job_ptr->cpu_count_reps,
+			(sizeof(uint32_t) * job_ptr->num_cpu_groups));
+
 	agent_arg_ptr = (agent_arg_t *) xmalloc(sizeof(agent_arg_t));
 	agent_arg_ptr->node_count = 1;
 	agent_arg_ptr->retry = 0;
diff --git a/src/slurmd/mgr.c b/src/slurmd/mgr.c
index dc6ffa919b4d6b3046501116d90e6a5d17bf82ae..329a06b96ad295a6ef4feead374f65a197910937 100644
--- a/src/slurmd/mgr.c
+++ b/src/slurmd/mgr.c
@@ -132,13 +132,13 @@ static void _setargs(slurmd_job_t *job);
 static void _set_launch_ip_in_env(slurmd_job_t *, slurm_addr *cli);
 static void _random_sleep(slurmd_job_t *job);
-
+static char *_sprint_task_cnt(batch_job_launch_msg_t *msg);
 /*
  * Batch job mangement prototypes:
  */
 static char * _make_batch_dir(slurmd_job_t *job);
 static char * _make_batch_script(batch_job_launch_msg_t *msg, char *path);
-static int    _setup_batch_env(slurmd_job_t *job, char *nodes);
+static int    _setup_batch_env(slurmd_job_t *job, batch_job_launch_msg_t *msg);
 static int    _complete_job(uint32_t jobid, int err, int status);
@@ -205,7 +205,7 @@ mgr_launch_batch_job(batch_job_launch_msg_t *msg, slurm_addr *cli)
 	if ((job->argv[0] = _make_batch_script(msg, batchdir)) == NULL)
 		goto cleanup2;
 
-	if ((rc = _setup_batch_env(job, msg->nodes)) < 0)
+	if ((rc = _setup_batch_env(job, msg)) < 0)
 		goto cleanup2;
 
 	status = _job_mgr(job);
@@ -910,10 +910,10 @@ _make_batch_script(batch_job_launch_msg_t *msg, char *path)
 }
 
 static int
-_setup_batch_env(slurmd_job_t *job, char *nodes)
+_setup_batch_env(slurmd_job_t *job, batch_job_launch_msg_t *msg)
 {
-	char buf[1024];
-	hostlist_t hl = hostlist_create(nodes);
+	char buf[1024], *task_buf;
+	hostlist_t hl = hostlist_create(msg->nodes);
 
 	if (!hl)
 		return SLURM_ERROR;
@@ -924,9 +924,34 @@ _setup_batch_env(slurmd_job_t *job, batch_job_launch_msg_t *msg)
 	setenvpf(&job->env, "SLURM_NODELIST", "%s", buf);
 	hostlist_destroy(hl);
 
+	task_buf = _sprint_task_cnt(msg);
+	setenvpf(&job->env, "SLURM_TASKS_PER_NODE", "%s", task_buf);
+	xfree(task_buf);
+
 	return 0;
 }
 
+static char *
+_sprint_task_cnt(batch_job_launch_msg_t *msg)
+{
+	int i;
+	char *task_str = xstrdup("");
+	char tmp[16], *comma = "";
+
+	for (i=0; i<msg->num_cpu_groups; i++) {
+		if (i == 1)
+			comma = ",";
+		if (msg->cpu_count_reps[i] > 1)
+			sprintf(tmp, "%s%d(x%d)", comma, msg->cpus_per_node[i],
+				msg->cpu_count_reps[i]);
+		else
+			sprintf(tmp, "%s%d", comma, msg->cpus_per_node[i]);
+		xstrcat(task_str, tmp);
+	}
+
+	return task_str;
+}
+
 static void
 _send_launch_failure (launch_tasks_request_msg_t *msg, slurm_addr *cli, int rc)
 {
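
For reference, cpus_per_node[] and cpu_count_reps[] are a run-length encoded list of per-node CPU counts (cpu_count_reps[i] consecutive nodes each have cpus_per_node[i] CPUs), and _sprint_task_cnt() flattens that encoding into the SLURM_TASKS_PER_NODE string exported to the batch script. A minimal standalone sketch of the same formatting logic, using plain libc in place of SLURM's xstrdup/xstrcat and invented sample data:

/*
 * Standalone sketch (not part of the patch) showing the string format that
 * _sprint_task_cnt() produces.  The sample arrays below are invented for
 * illustration only.
 */
#include <stdio.h>
#include <string.h>
#include <stdint.h>

int main(void)
{
	/* e.g. three nodes with 4 CPUs each, then one node with 2 CPUs */
	uint16_t num_cpu_groups   = 2;
	uint32_t cpus_per_node[]  = { 4, 2 };
	uint32_t cpu_count_reps[] = { 3, 1 };
	char task_str[256] = "";
	char tmp[64];
	const char *comma = "";
	int i;

	for (i = 0; i < num_cpu_groups; i++) {
		if (i == 1)
			comma = ",";
		if (cpu_count_reps[i] > 1)
			sprintf(tmp, "%s%u(x%u)", comma,
				(unsigned) cpus_per_node[i],
				(unsigned) cpu_count_reps[i]);
		else
			sprintf(tmp, "%s%u", comma,
				(unsigned) cpus_per_node[i]);
		strcat(task_str, tmp);
	}

	printf("SLURM_TASKS_PER_NODE=%s\n", task_str);	/* 4(x3),2 */
	return 0;
}

With those sample arrays the batch script would see SLURM_TASKS_PER_NODE=4(x3),2, i.e. three nodes with 4 CPUs each followed by one node with 2 CPUs, which is the compressed form LAM/MPI and similar launchers can parse to recover the per-node task layout.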