From be5880e4db5e66e15c41a4860c402377f4ec9828 Mon Sep 17 00:00:00 2001 From: Mark Grondona <mgrondona@llnl.gov> Date: Thu, 20 May 2004 23:14:14 +0000 Subject: [PATCH] o job_desc_msg_create() -> job_desc_msg_create_from_opts() o Use job_desc_msg_create_from_opts() to build job_desc_msg for batch jobs (fewer places to update in future) --- src/srun/allocate.c | 23 +++++++++- src/srun/allocate.h | 6 +-- src/srun/env.c | 11 +++++ src/srun/env.h | 5 +++ src/srun/launch.c | 13 +----- src/srun/srun.c | 102 +++++++++----------------------------------- 6 files changed, 62 insertions(+), 98 deletions(-) diff --git a/src/srun/allocate.c b/src/srun/allocate.c index 57f67a66ecc..b63c6624590 100644 --- a/src/srun/allocate.c +++ b/src/srun/allocate.c @@ -67,7 +67,7 @@ allocate_nodes(void) SigFunc *oquitf, *ointf, *otermf; sigset_t oset; resource_allocation_response_msg_t *resp = NULL; - job_desc_msg_t *j = job_desc_msg_create(); + job_desc_msg_t *j = job_desc_msg_create_from_opts (NULL); oquitf = xsignal(SIGQUIT, _intr_handler); ointf = xsignal(SIGINT, _intr_handler); @@ -214,8 +214,9 @@ _intr_handler(int signo) * (see opt.h) */ job_desc_msg_t * -job_desc_msg_create(void) +job_desc_msg_create_from_opts (const char *script) { + extern char **environ; job_desc_msg_t *j = xmalloc(sizeof(*j)); slurm_init_job_desc_msg(j); @@ -265,6 +266,24 @@ job_desc_msg_create(void) else j->host = NULL; + if (script) { + /* + * If script is set then we are building a request for + * a batch job + */ + xassert (opt.batch); + + j->environment = environ; + j->env_size = envcount (environ); + j->script = script; + j->argv = remote_argv; + j->argc = remote_argc; + j->err = opt.efname; + j->in = opt.ifname; + j->out = opt.ofname; + j->work_dir = opt.cwd; + } + return (j); } diff --git a/src/srun/allocate.h b/src/srun/allocate.h index b3fa5e19315..4b8da6d0fc4 100644 --- a/src/srun/allocate.h +++ b/src/srun/allocate.h @@ -44,16 +44,16 @@ resource_allocation_response_msg_t * allocate_nodes(void); /* * 
Create a job_desc_msg_t object, filled in from the current srun options - * (see opt.h) + * (see opt.h), if script != NULL then this is a batch job. * The resulting memory must be freed with job_desc_msg_destroy() */ -job_desc_msg_t * job_desc_msg_create(void); +job_desc_msg_t * job_desc_msg_create_from_opts (const char *script); /* * Destroy (free memory from) a job_desc_msg_t object allocated with * job_desc_msg_create() */ -void job_desc_msg_destroy(job_desc_msg_t *j); +void job_desc_msg_destroy (job_desc_msg_t *j); /* * Check for SLURM_JOBID environment variable, and if it is a valid diff --git a/src/srun/env.c b/src/srun/env.c index 21b60b0c05d..96305a117c0 100644 --- a/src/srun/env.c +++ b/src/srun/env.c @@ -33,3 +33,14 @@ setenvf(const char *fmt, ...) return putenv(bufcpy); } +/* + * Return the number of entries in the NULL-terminated array `env' + */ +int +envcount (char **env) +{ + int envc = 0; + while (env[envc] != NULL) + envc++; + return (envc); +} diff --git a/src/srun/env.h b/src/srun/env.h index ede76189f53..4d74bc5a92a 100644 --- a/src/srun/env.h +++ b/src/srun/env.h @@ -14,4 +14,9 @@ */ int setenvf(const char *fmt, ...); +/* + * Return the number of elements in the environment array `env' + */ +int envcount (char **env); + #endif /* _HAVE_ENV_H */ diff --git a/src/srun/launch.c b/src/srun/launch.c index 0d53e984d61..1f1600c7ace 100644 --- a/src/srun/launch.c +++ b/src/srun/launch.c @@ -43,6 +43,7 @@ #include "src/srun/job.h" #include "src/srun/launch.h" #include "src/srun/opt.h" +#include "src/srun/env.h" extern char **environ; @@ -73,7 +74,6 @@ static void _p_launch(slurm_msg_t *req_array_ptr, job_t *job); static void * _p_launch_task(void *args); static void _print_launch_msg(launch_tasks_request_msg_t *msg, char * hostname); -static int _envcount(char **env); int launch_thr_create(job_t *job) @@ -109,7 +109,7 @@ launch(void *arg) msg_array_ptr = xmalloc(sizeof(launch_tasks_request_msg_t)*job->nhosts); req_array_ptr = xmalloc(sizeof(slurm_msg_t) * 
job->nhosts); - my_envc = _envcount(environ); + my_envc = envcount(environ); for (i = 0; i < job->nhosts; i++) { launch_tasks_request_msg_t *r = &msg_array_ptr[i]; slurm_msg_t *m = &req_array_ptr[i]; @@ -445,12 +445,3 @@ _print_launch_msg(launch_tasks_request_msg_t *msg, char * hostname) debug3("uid:%ld cwd:%s %d", (long) msg->uid, msg->cwd, msg->srun_node_id); } - -static int -_envcount(char **environ) -{ - int envc = 0; - while (environ[envc] != NULL) - envc++; - return envc; -} diff --git a/src/srun/srun.c b/src/srun/srun.c index 52128f78179..026baa520b0 100644 --- a/src/srun/srun.c +++ b/src/srun/srun.c @@ -345,10 +345,10 @@ _run_batch_job(void) { int file_type, retries; int rc = SLURM_SUCCESS; - job_desc_msg_t job; + job_desc_msg_t *req; submit_response_msg_t *resp; - extern char **environ; - char *job_script; + char *script; + void (*log_msg) (const char *fmt, ...) = (void (*)) &error; if ((remote_argc == 0) || (remote_argv[0] == NULL)) return SLURM_ERROR; @@ -361,100 +361,38 @@ _run_batch_job(void) * } */ - job_script = _build_script (remote_argv[0], file_type); - if (job_script == NULL) { + if ((script = _build_script (remote_argv[0], file_type)) == NULL) { error ("unable to build script from file %s", remote_argv[0]); return SLURM_ERROR; } - slurm_init_job_desc_msg(&job); - - job.contiguous = opt.contiguous; - job.features = opt.constraints; - - job.name = opt.job_name; - - job.partition = opt.partition; - - if (opt.hold) - job.priority = 0; - if (opt.mincpus > -1) - job.min_procs = opt.mincpus; - if (opt.realmem > -1) - job.min_memory = opt.realmem; - if (opt.tmpdisk > -1) - job.min_tmp_disk = opt.tmpdisk; - - job.req_nodes = opt.nodelist; - job.exc_nodes = opt.exc_nodes; - - if (opt.overcommit) - job.num_procs = opt.min_nodes; - else - job.num_procs = opt.nprocs * opt.cpus_per_task; - - job.min_nodes = opt.min_nodes; - if (opt.max_nodes) - job.max_nodes = opt.max_nodes; - - job.num_tasks = opt.nprocs; - - job.user_id = opt.uid; - job.group_id = getgid(); 
- - if (opt.hold) - job.priority = 0; - if (opt.no_kill) - job.kill_on_node_fail = 0; - if (opt.time_limit > -1) - job.time_limit = opt.time_limit; - if (opt.share) - job.shared = 1; - - /* _set_batch_script_env(job); */ - job.environment = environ; - - job.env_size = 0; - while (environ[job.env_size] != NULL) - job.env_size++; - - job.script = job_script; - job.argv = remote_argv; - job.argc = remote_argc; - job.err = opt.efname; - job.in = opt.ifname; - job.out = opt.ofname; - job.work_dir = opt.cwd; + if (!(req = job_desc_msg_create_from_opts (script))) + fatal ("Unable to create job request"); retries = 0; - while ((rc = slurm_submit_batch_job(&job, &resp)) < 0) { - if ( (errno == ESLURM_ERROR_ON_DESC_TO_RECORD_COPY) - && (retries < MAX_RETRIES) ) { - if (retries == 0) - error ("Slurm controller not responding, " - "sleeping and retrying"); - else - debug ("Slurm controller not responding, " - "sleeping and retrying"); + while ( (retries < MAX_RETRIES) + && (rc = slurm_submit_batch_job(req, &resp)) < 0) { - sleep (++retries); - } - else { - error("Unable to submit batch job resources: %s", - slurm_strerror(errno)); - return SLURM_ERROR; - } + if (errno != ESLURM_ERROR_ON_DESC_TO_RECORD_COPY) + return (error("Unable to submit batch job: %m")); + + (*log_msg) ("Controller not responding, retrying..."); + log_msg = &debug; + sleep (++retries); } if (rc == SLURM_SUCCESS) { - info("jobid %u submitted",resp->job_id); + info ("jobid %u submitted",resp->job_id); if (resp->error_code) info("Warning: %s", slurm_strerror(resp->error_code)); slurm_free_submit_response_response_msg (resp); } - xfree (job_script); - return rc; + + job_desc_msg_destroy (req); + xfree (script); + + return (rc); } /* _get_shell - return a string containing the default shell for this user -- GitLab