From 0344cdf364739af09e2f00b199aa163c60700a1c Mon Sep 17 00:00:00 2001 From: Morris Jette <jette@schedmd.com> Date: Tue, 26 Apr 2011 21:36:56 -0700 Subject: [PATCH] salloc: support for Cray container IDs This adds detection and use of SGI process aggregate job container IDs for salloc interactive sessions. The preferred and documented way to support this on a Cray system is by enabling the provided pam_job.so via /etc/pam.d/common-session. There is a header dependency on job.h. This file depends on the optional cray-libjob-devel package, which installs into /opt/cray/job/<version>. This package is however not always installed or may not be up-to-date. Hence the patch "cheats" by duplicating the known prototype of job_getjid(). --- src/salloc/salloc.c | 31 +++++++++++++++++++++++++++---- src/slurmctld/proc_req.c | 6 +++--- 2 files changed, 30 insertions(+), 7 deletions(-) diff --git a/src/salloc/salloc.c b/src/salloc/salloc.c index 8b5645b7298..68510b40024 100644 --- a/src/salloc/salloc.c +++ b/src/salloc/salloc.c @@ -70,10 +70,14 @@ #ifdef HAVE_BG #include "src/common/node_select.h" #include "src/plugins/select/bluegene/bg_enums.h" -#endif - -#ifndef __USE_XOPEN_EXTENDED -extern pid_t getsid(pid_t pid); /* missing from <unistd.h> */ +#elif defined(HAVE_CRAY) +#include "src/common/node_select.h" +/* + * On Cray installations, the libjob headers are not automatically installed + * by default, while libjob.so always is, and kernels are > 2.6. Hence it is + * simpler to just duplicate the single declaration here. + */ +extern uint64_t job_getjid(pid_t pid); #endif #define MAX_RETRIES 10 @@ -543,6 +547,25 @@ static void _set_submit_dir_env(void) /* Returns 0 on success, -1 on failure */ static int _fill_job_desc_from_opts(job_desc_msg_t *desc) { +#ifdef HAVE_CRAY + uint64_t pagg_id = job_getjid(getpid()); + /* + * Interactive sessions require pam_job.so in /etc/pam.d/common-session + * since creating sgi_job containers requires root permissions. 
This is + * the only exception where we allow the fallback of using the SID to + * confirm the reservation (caught later, in do_basil_confirm). + */ + if (pagg_id == (uint64_t)-1) { + error("No SGI job container ID detected - please enable the " + "Cray job service via /etc/init.d/job"); + } else { + if (!desc->select_jobinfo) + desc->select_jobinfo = select_g_select_jobinfo_alloc(); + + select_g_select_jobinfo_set(desc->select_jobinfo, + SELECT_JOBDATA_PAGG_ID, &pagg_id); + } +#endif desc->contiguous = opt.contiguous ? 1 : 0; desc->features = opt.constraints; desc->gres = opt.gres; diff --git a/src/slurmctld/proc_req.c b/src/slurmctld/proc_req.c index 80722779576..44d0c65138c 100644 --- a/src/slurmctld/proc_req.c +++ b/src/slurmctld/proc_req.c @@ -777,9 +777,9 @@ static void _slurm_rpc_allocate_resources(slurm_msg_t * msg) } #if HAVE_CRAY /* - * We are using the alloc_sid as unique identifier to confirm the ALPS - * reservation. ALPS will refuse any attempt to create a second session - * with the same identifier, hence sessions may not be nested. + * Catch attempts to nest salloc sessions. It is not possible to use an + * ALPS session which has the same alloc_sid; it fails even if PAGG + * container IDs are used. */ if (allocated_session_in_use(job_desc_msg)) { error_code = ESLURM_RESERVATION_BUSY; -- GitLab