From 87d03dd8ccaccc6775155e9f9f23c439c05b1ac1 Mon Sep 17 00:00:00 2001
From: Moe Jette <jette1@llnl.gov>
Date: Tue, 14 Sep 2010 21:22:41 +0000
Subject: [PATCH] Add support for SLURM_CLUSTERS environment variable in the
 sbatch command.

---
 NEWS                       |  1 +
 doc/man/man1/sbatch.1      | 23 +++++++++++++----------
 src/sbatch/mult_cluster.c  | 20 +++++++++++---------
 src/sbatch/opt.c           |  8 +++++---
 src/sbatch/sbatch.c        |  6 +++---
 src/slurmctld/controller.c |  5 +++--
 6 files changed, 36 insertions(+), 27 deletions(-)

diff --git a/NEWS b/NEWS
index 12e353ad20e..e4fc8550bd2 100644
--- a/NEWS
+++ b/NEWS
@@ -8,6 +8,7 @@ documents those changes that are of interest to users and admins.
    partitions and use lua metatables to reference the job and partition fields.
 -- Add support for serveral new trigger types: SlurmDBD failure/restart,
    Database failure/restart, Slurmctld failure/restart.
+ -- Add support for SLURM_CLUSTERS environment variable in the sbatch command.
 
 * Changes in SLURM 2.2.0.pre10
 ==============================
diff --git a/doc/man/man1/sbatch.1 b/doc/man/man1/sbatch.1
index a90b979001c..8f098d8eb51 100644
--- a/doc/man/man1/sbatch.1
+++ b/doc/man/man1/sbatch.1
@@ -1,4 +1,4 @@
-.TH "sbatch" "1" "SLURM 2.2" "April 2010" "SLURM Commands"
+.TH "sbatch" "1" "SLURM 2.2" "September 2010" "SLURM Commands"
 
 .SH "NAME"
 sbatch \- Submit a batch script to SLURM.
@@ -126,15 +126,6 @@ The default value is the current working directory.
 Checkpoint files will be of the form "<job_id>.ckpt" for jobs
 and "<job_id>.<step_id>.ckpt" for job steps.
 
-.TP
-\fB\-\-clusters\fR=<\fIstring\fR>
-Clusters to issue commands to. Multiple cluster names may be comma separated.
-The job will be submitted to the one cluster providing the earliest expected
-job initiation time. The default value is the current cluster. A value of
-of '\fIall\fR' will query to run on all clusters. Note the
-\fB\-\-export\fR option to control environment variables exported
-between clusters.
-
 .TP
 \fB\-\-comment\fR=<\fIstring\fR>
 An arbitrary comment.
@@ -502,6 +493,15 @@ License names can be followed by an asterisk and count
 Multiple license names should be comma separated (e.g.
 "\-\-licenses=foo*4,bar").
 
+.TP
+\fB\-M\fR, \fB\-\-clusters\fR=<\fIstring\fR>
+Clusters to issue commands to. Multiple cluster names may be comma separated.
+The job will be submitted to the one cluster providing the earliest expected
+job initiation time. The default value is the current cluster. A value
+of '\fIall\fR' will query to run on all clusters. Note the
+\fB\-\-export\fR option to control environment variables exported
+between clusters.
+
 .TP
 \fB\-m\fR, \fB\-\-distribution\fR=
 <\fIblock\fR|\fIcyclic\fR|\fIarbitrary\fR|\fIplane=<options>\fR>
@@ -1070,6 +1070,9 @@ Same as \fB\-\-checkpoint\fR
 \fBSLURM_CHECKPOINT_DIR\fR
 Same as \fB\-\-checkpoint\-dir\fR
 .TP
+\fBSBATCH_CLUSTERS\fR or \fBSLURM_CLUSTERS\fR
+Same as \fB\-\-clusters\fR
+.TP
 \fBSBATCH_CONN_TYPE\fR
 Same as \fB\-\-conn\-type\fR
 .TP
diff --git a/src/sbatch/mult_cluster.c b/src/sbatch/mult_cluster.c
index 2b203317f0c..7e52f366398 100644
--- a/src/sbatch/mult_cluster.c
+++ b/src/sbatch/mult_cluster.c
@@ -108,7 +108,7 @@ local_cluster_rec_t *_job_will_run (job_desc_msg_t *req)
 		slurm_seterrno(rc);
 		break;
 	case RESPONSE_JOB_WILL_RUN:
-		if(working_cluster_rec->flags & CLUSTER_FLAG_BG)
+		if (working_cluster_rec->flags & CLUSTER_FLAG_BG)
 			type = "cnodes";
 		will_run_resp = (will_run_response_msg_t *) resp_msg.data;
 		slurm_make_time_str(&will_run_resp->start_time,
@@ -123,7 +123,7 @@ local_cluster_rec_t *_job_will_run (job_desc_msg_t *req)
 		if (will_run_resp->preemptee_job_id) {
 			local_cluster->preempt_cnt =
 				list_count(will_run_resp->preemptee_job_id);
-			if(opt.verbose >= LOG_LEVEL_DEBUG) {
+			if (opt.verbose >= LOG_LEVEL_DEBUG) {
 				ListIterator itr;
 				uint32_t *job_id_ptr;
 				char *job_list = NULL, *sep = "";
@@ -160,9 +160,9 @@ extern int sbatch_set_first_avail_cluster(job_desc_msg_t *req)
 	List ret_list = NULL;
 
 	/* return if we only have 1 or less clusters here */
-	if(!opt.clusters || !list_count(opt.clusters)) {
+	if (!opt.clusters || !list_count(opt.clusters)) {
 		return rc;
-	} else if(list_count(opt.clusters) == 1) {
+	} else if (list_count(opt.clusters) == 1) {
 		working_cluster_rec = list_peek(opt.clusters);
 		return rc;
 	}
@@ -174,12 +174,14 @@ extern int sbatch_set_first_avail_cluster(job_desc_msg_t *req)
 	}
 
 	ret_list = list_create(_destroy_local_cluster_rec);
+	if (ret_list == NULL)
+		fatal("list_create malloc failure");
 	itr = list_iterator_create(opt.clusters);
-	while((working_cluster_rec = list_next(itr))) {
-		if((local_cluster = _job_will_run(req))) {
-			if(!ret_list)
+	while ((working_cluster_rec = list_next(itr))) {
+		if ((local_cluster = _job_will_run(req))) {
+			if (!ret_list)
 				ret_list = list_create(
-					_destroy_local_cluster_rec);
+						_destroy_local_cluster_rec);
 			list_append(ret_list, local_cluster);
 		} else
 			error("Problem with submit to cluster %s: %m",
@@ -190,7 +192,7 @@ extern int sbatch_set_first_avail_cluster(job_desc_msg_t *req)
 	if (host_set)
 		req->alloc_node = NULL;
 
-	if(!ret_list) {
+	if (!ret_list) {
 		error("Can't run on any of the clusters given");
 		return SLURM_ERROR;
 	}
diff --git a/src/sbatch/opt.c b/src/sbatch/opt.c
index 9622febec05..6b725d8b97e 100644
--- a/src/sbatch/opt.c
+++ b/src/sbatch/opt.c
@@ -438,6 +438,8 @@ env_vars_t env_vars[] = {
   {"SBATCH_BLRTS_IMAGE",   OPT_STRING,     &opt.blrtsimage,    NULL },
   {"SBATCH_CHECKPOINT",    OPT_STRING,     &opt.ckpt_interval_str, NULL },
   {"SBATCH_CHECKPOINT_DIR",OPT_STRING,     &opt.ckpt_dir,      NULL },
+  {"SBATCH_CLUSTERS",      OPT_STRING,     &opt.clusters,      NULL },
+  {"SLURM_CLUSTERS",       OPT_STRING,     &opt.clusters,      NULL },
   {"SBATCH_CNLOAD_IMAGE",  OPT_STRING,     &opt.linuximage,    NULL },
   {"SBATCH_CONN_TYPE",     OPT_CONN_TYPE,  NULL,               NULL },
   {"SBATCH_CPU_BIND",      OPT_CPU_BIND,   NULL,               NULL },
@@ -1210,10 +1212,10 @@ static void _set_options(int argc, char **argv)
 			}
 			break;
 		case 'M':
-			if(opt.clusters)
+			if (opt.clusters)
 				list_destroy(opt.clusters);
-			if(!(opt.clusters =
-			     slurmdb_get_info_cluster(optarg))) {
+			if (!(opt.clusters =
+			      slurmdb_get_info_cluster(optarg))) {
 				error("'%s' invalid entry for --clusters",
 				      optarg);
 				exit(1);
diff --git a/src/sbatch/sbatch.c b/src/sbatch/sbatch.c
index 94344b6865d..7f0d08dfb79 100644
--- a/src/sbatch/sbatch.c
+++ b/src/sbatch/sbatch.c
@@ -148,8 +148,8 @@ int main(int argc, char *argv[])
 	desc.script = (char *)script_body;
 
 	/* If can run on multiple clusters find the earliest run time
-	   and run it there */
-	if(sbatch_set_first_avail_cluster(&desc) != SLURM_SUCCESS)
+	 * and run it there */
+	if (sbatch_set_first_avail_cluster(&desc) != SLURM_SUCCESS)
 		exit(error_exit);
 
 	while (slurm_submit_batch_job(&desc, &resp) < 0) {
@@ -180,7 +180,7 @@ int main(int argc, char *argv[])
 	}
 
 	printf("Submitted batch job %u", resp->job_id);
-	if(working_cluster_rec)
+	if (working_cluster_rec)
 		printf(" on cluster %s", working_cluster_rec->name);
 	printf("\n");
 
diff --git a/src/slurmctld/controller.c b/src/slurmctld/controller.c
index c6f316e7900..4167d60ac2f 100644
--- a/src/slurmctld/controller.c
+++ b/src/slurmctld/controller.c
@@ -360,10 +360,11 @@ int main(int argc, char *argv[])
 	assoc_init_arg.remove_assoc_notify = _remove_assoc;
 	assoc_init_arg.remove_qos_notify = _remove_qos;
 	assoc_init_arg.cache_level = ASSOC_MGR_CACHE_ASSOC |
-		ASSOC_MGR_CACHE_USER | ASSOC_MGR_CACHE_QOS;
+				     ASSOC_MGR_CACHE_USER |
+				     ASSOC_MGR_CACHE_QOS;
 
 	if (assoc_mgr_init(acct_db_conn, &assoc_init_arg)) {
-		if(accounting_enforce & ACCOUNTING_ENFORCE_ASSOCS)
+		if (accounting_enforce & ACCOUNTING_ENFORCE_ASSOCS)
 			error("Association database appears down, "
 			      "reading from state file.");
 		else
-- 
GitLab
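
For anyone trying the patch, a minimal usage sketch of the new environment
variable. The cluster names, script name, and job id below (cluster_a,
cluster_b, job.sh, 1234) are hypothetical; as with the existing
-M/--clusters option, this assumes sbatch can reach the SLURM database
(slurmdbd) to resolve the named clusters.

    $ export SLURM_CLUSTERS=cluster_a,cluster_b    # same effect as -M/--clusters
    $ sbatch job.sh
    Submitted batch job 1234 on cluster cluster_a

SBATCH_CLUSTERS behaves identically, and an explicit command-line option
still takes precedence over the environment: the 'M' case in opt.c destroys
any cluster list already built from the environment before parsing optarg,
so the job lands on whichever of the listed clusters reports the earliest
expected start time.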