From c10aef0f5d807707cef45dd16432f14a79274bf1 Mon Sep 17 00:00:00 2001 From: Danny Auble <da@llnl.gov> Date: Wed, 24 Jun 2009 19:47:14 +0000 Subject: [PATCH] svn merge -r17951:17962 https://eris.llnl.gov/svn/slurm/branches/slurm-2.0 --- NEWS | 2 + contribs/perlapi/libslurm-perl/alloc.c | 2 +- src/common/slurm_protocol_defs.c | 3 ++ src/common/slurm_protocol_pack.c | 1 + src/slurmd/slurmstepd/io.c | 8 +++- src/slurmd/slurmstepd/mgr.c | 57 +++++++++++++++++--------- testsuite/expect/test1.29 | 4 +- 7 files changed, 52 insertions(+), 25 deletions(-) diff --git a/NEWS b/NEWS index 78ca5dc4d61..dd70abc76fe 100644 --- a/NEWS +++ b/NEWS @@ -58,6 +58,8 @@ documents those changes that are of interest to users and admins. -- Permit node suspend/resume logic to be enabled through "scontrol reconfig" given appropriate changes to slurm configuration file. -- Check for return codes on functions with warn_unused_result set + -- Fix memory leak in getting step information + -- Better logging for when job's request bad output locations. * Changes in SLURM 2.0.3 ======================== diff --git a/contribs/perlapi/libslurm-perl/alloc.c b/contribs/perlapi/libslurm-perl/alloc.c index e6194392ab1..417e5da2673 100644 --- a/contribs/perlapi/libslurm-perl/alloc.c +++ b/contribs/perlapi/libslurm-perl/alloc.c @@ -140,7 +140,7 @@ hv_to_job_desc_msg(HV* hv, job_desc_msg_t* job_desc_msg) avp = (AV*)SvRV(*svp); for(i = 0; i < SYSTEM_DIMENSIONS; i ++) { if(! (svp = av_fetch(avp, i, FALSE))) { - Perl_warn(aTHX_ "geometry of dimension %s missing in job descriptor", i); + Perl_warn(aTHX_ "geometry of dimension %d missing in job descriptor", i); free_job_desc_msg_memory(job_desc_msg); return -1; } diff --git a/src/common/slurm_protocol_defs.c b/src/common/slurm_protocol_defs.c index 905057b0d99..5c8871643c6 100644 --- a/src/common/slurm_protocol_defs.c +++ b/src/common/slurm_protocol_defs.c @@ -1501,7 +1501,10 @@ static void _slurm_free_job_step_info_members (job_step_info_t * msg) { if (msg != NULL) { xfree(msg->partition); + xfree(msg->resv_ports); xfree(msg->nodes); + xfree(msg->name); + xfree(msg->network); xfree(msg->ckpt_dir); } } diff --git a/src/common/slurm_protocol_pack.c b/src/common/slurm_protocol_pack.c index 8dcea5962b8..851eaa429cb 100644 --- a/src/common/slurm_protocol_pack.c +++ b/src/common/slurm_protocol_pack.c @@ -2694,6 +2694,7 @@ _unpack_job_step_info_members(job_step_info_t * step, Buf buffer) unpack_error: xfree(step->partition); + xfree(step->resv_ports); xfree(step->nodes); xfree(step->name); xfree(step->network); diff --git a/src/slurmd/slurmstepd/io.c b/src/slurmd/slurmstepd/io.c index 680faec492e..a238ab3596a 100644 --- a/src/slurmd/slurmstepd/io.c +++ b/src/slurmd/slurmstepd/io.c @@ -945,7 +945,7 @@ _init_task_stdio_fds(slurmd_task_info_t *task, slurmd_job_t *job) /* open file on task's stdin */ debug5(" stdin file name = %s", task->ifname); if ((task->stdin_fd = open(task->ifname, O_RDONLY)) == -1) { - error("Could not open stdin file: %m"); + error("Could not open stdin file %s: %m", task->ifname); return SLURM_ERROR; } fd_set_close_on_exec(task->stdin_fd); @@ -999,6 +999,8 @@ _init_task_stdio_fds(slurmd_task_info_t *task, slurmd_job_t *job) debug5(" stdout file name = %s", task->ofname); task->stdout_fd = open(task->ofname, file_flags, 0666); if (task->stdout_fd == -1) { + error("Could not open stdout file %s: %m", + task->ofname); return SLURM_ERROR; } fd_set_close_on_exec(task->stdout_fd); @@ -1055,6 +1057,8 @@ _init_task_stdio_fds(slurmd_task_info_t *task, slurmd_job_t *job) debug5(" stderr file name = %s", task->efname); task->stderr_fd = open(task->efname, file_flags, 0666); if (task->stderr_fd == -1) { + error("Could not open stderr file %s: %m", + task->efname); return SLURM_ERROR; } fd_set_close_on_exec(task->stderr_fd); @@ -1278,7 +1282,7 @@ io_close_all(slurmd_job_t *job) * and log facility may still try to write to stderr. */ if ((devnull = open("/dev/null", O_RDWR)) < 0) { - error("Unable to open /dev/null: %m"); + error("Could not open /dev/null: %m"); } else { if (dup2(devnull, STDERR_FILENO) < 0) error("Unable to dup /dev/null onto stderr\n"); diff --git a/src/slurmd/slurmstepd/mgr.c b/src/slurmd/slurmstepd/mgr.c index b29735a4edf..7684dc26930 100644 --- a/src/slurmd/slurmstepd/mgr.c +++ b/src/slurmd/slurmstepd/mgr.c @@ -174,7 +174,7 @@ static int _drop_privileges(slurmd_job_t *job, bool do_setuid, struct priv_state *state); static int _reclaim_privileges(struct priv_state *state); static void _send_launch_resp(slurmd_job_t *job, int rc); -static void _slurmd_job_log_init(slurmd_job_t *job); +static int _slurmd_job_log_init(slurmd_job_t *job); static void _wait_for_io(slurmd_job_t *job); static int _send_exit_msg(slurmd_job_t *job, uint32_t *tid, int n, int status); @@ -414,8 +414,12 @@ _setup_normal_io(slurmd_job_t *job) file_flags, job, job->labelio, job->task[ii]->id, same ? job->task[ii]->id : -2); - if (rc != SLURM_SUCCESS) + if (rc != SLURM_SUCCESS) { + error("Could not open output " + "file %s: %m", + job->task[ii]->ofname); return ESLURMD_IO_ERROR; + } } srun_stdout_tasks = -2; if (same) @@ -426,9 +430,12 @@ _setup_normal_io(slurmd_job_t *job) job->task[0]->ofname, file_flags, job, job->labelio, -1, same ? -1 : -2); - if (rc != SLURM_SUCCESS) + if (rc != SLURM_SUCCESS) { + error("Could not open output " + "file %s: %m", + job->task[0]->ofname); return ESLURMD_IO_ERROR; - + } srun_stdout_tasks = -2; if (same) srun_stderr_tasks = -2; @@ -443,8 +450,14 @@ _setup_normal_io(slurmd_job_t *job) file_flags, job, job->labelio, -2, job->task[ii]->id); - if (rc != SLURM_SUCCESS) + if (rc != SLURM_SUCCESS) { + error("Could not " + "open error " + "file %s: %m", + job->task[ii]-> + efname); return ESLURMD_IO_ERROR; + } } srun_stderr_tasks = -2; } else if (errpattern == SLURMD_ALL_SAME) { @@ -453,14 +466,16 @@ _setup_normal_io(slurmd_job_t *job) job->task[0]->efname, file_flags, job, job->labelio, -2, -1); - if (rc != SLURM_SUCCESS) + if (rc != SLURM_SUCCESS) { + error("Could not open error " + "file %s: %m", + job->task[0]->efname); return ESLURMD_IO_ERROR; - + } srun_stderr_tasks = -2; } } } - rc = io_initial_client_connect(srun, job, srun_stdout_tasks, srun_stderr_tasks); if (rc < 0) @@ -476,7 +491,7 @@ _setup_normal_io(slurmd_job_t *job) if (io_thread_start(job) < 0) return ESLURMD_IO_ERROR; } - + debug2("Leaving _setup_normal_io"); return SLURM_SUCCESS; } @@ -811,6 +826,13 @@ job_manager(slurmd_job_t *job) goto fail1; } +#ifndef NDEBUG +# ifdef PR_SET_DUMPABLE + if (prctl(PR_SET_DUMPABLE, 1) < 0) + debug ("Unable to set dumpable to 1"); +# endif /* PR_SET_DUMPABLE */ +#endif /* !NDEBUG */ + set_umask(job); /* set umask for stdout/err files */ if (job->user_managed_io) rc = _setup_user_managed_io(job); @@ -819,15 +841,9 @@ job_manager(slurmd_job_t *job) /* * Initialize log facility to copy errors back to srun */ - _slurmd_job_log_init(job); + if(!rc) + rc = _slurmd_job_log_init(job); -#ifndef NDEBUG -# ifdef PR_SET_DUMPABLE - if (prctl(PR_SET_DUMPABLE, 1) < 0) - debug ("Unable to set dumpable to 1"); -# endif /* PR_SET_DUMPABLE */ -#endif /* !NDEBUG */ - if (rc) { error("IO setup failed: %m"); rc = SLURM_SUCCESS; /* drains node otherwise */ @@ -1734,7 +1750,7 @@ _reclaim_privileges(struct priv_state *ps) } -static void +static int _slurmd_job_log_init(slurmd_job_t *job) { char argv0[64]; @@ -1761,15 +1777,16 @@ _slurmd_job_log_init(slurmd_job_t *job) log_alter(conf->log_opts, 0, NULL); log_set_argv0(argv0); - + /* Connect slurmd stderr to job's stderr */ if (!job->user_managed_io && job->task != NULL) { if (dup2(job->task[0]->stderr_fd, STDERR_FILENO) < 0) { error("job_log_init: dup2(stderr): %m"); - return; + return ESLURMD_IO_ERROR; } } verbose("debug level = %d", conf->log_opts.stderr_level); + return SLURM_SUCCESS; } diff --git a/testsuite/expect/test1.29 b/testsuite/expect/test1.29 index 828d2b79456..76d8a9114d3 100755 --- a/testsuite/expect/test1.29 +++ b/testsuite/expect/test1.29 @@ -220,8 +220,8 @@ expect { if {$matches != 5} { send_user "\nFAILURE: User limits not propagated got $matches matches\n" - send_user" Check $file_err for errors\n" - send_user" A long running slurmd could cause a file size limit error\n" + send_user "Check $file_err for errors\n" + send_user "A long running slurmd could cause a file size limit error\n" set exit_code 1 } -- GitLab