Skip to content
Snippets Groups Projects
Commit 0344cdf3 authored by Morris Jette
Browse files

salloc: support for Cray container IDs

This adds detection and use of SGI process aggregate job container IDs for
salloc interactive sessions.

The preferred and documented way to support this on a Cray system is to
enable the provided pam_job.so via /etc/pam.d/common-session.

There is a header dependency on job.h. This header is provided by the optional
cray-libjob-devel package, which installs into /opt/cray/job/<version>.

This package, however, is not always installed and may not be up to date.
Hence the patch "cheats" by duplicating the known prototype of job_getjid() instead of including job.h.
parent 4545a1f0
No related branches found
No related tags found
No related merge requests found
...@@ -70,10 +70,14 @@ ...@@ -70,10 +70,14 @@
#ifdef HAVE_BG #ifdef HAVE_BG
#include "src/common/node_select.h" #include "src/common/node_select.h"
#include "src/plugins/select/bluegene/bg_enums.h" #include "src/plugins/select/bluegene/bg_enums.h"
#endif #elif defined(HAVE_CRAY)
#include "src/common/node_select.h"
#ifndef __USE_XOPEN_EXTENDED /*
extern pid_t getsid(pid_t pid); /* missing from <unistd.h> */ * On Cray installations, the libjob headers are not automatically installed
* by default, while libjob.so always is, and kernels are > 2.6. Hence it is
* simpler to just duplicate the single declaration here.
*/
extern uint64_t job_getjid(pid_t pid);
#endif #endif
#define MAX_RETRIES 10 #define MAX_RETRIES 10
...@@ -543,6 +547,25 @@ static void _set_submit_dir_env(void) ...@@ -543,6 +547,25 @@ static void _set_submit_dir_env(void)
/* Returns 0 on success, -1 on failure */ /* Returns 0 on success, -1 on failure */
static int _fill_job_desc_from_opts(job_desc_msg_t *desc) static int _fill_job_desc_from_opts(job_desc_msg_t *desc)
{ {
#ifdef HAVE_CRAY
uint64_t pagg_id = job_getjid(getpid());
/*
* Interactive sessions require pam_job.so in /etc/pam.d/common-session
* since creating sgi_job containers requires root permissions. This is
* the only exception where we allow the fallback of using the SID to
* confirm the reservation (caught later, in do_basil_confirm).
*/
if (pagg_id == (uint64_t)-1) {
error("No SGI job container ID detected - please enable the "
"Cray job service via /etc/init.d/job");
} else {
if (!desc->select_jobinfo)
desc->select_jobinfo = select_g_select_jobinfo_alloc();
select_g_select_jobinfo_set(desc->select_jobinfo,
SELECT_JOBDATA_PAGG_ID, &pagg_id);
}
#endif
desc->contiguous = opt.contiguous ? 1 : 0; desc->contiguous = opt.contiguous ? 1 : 0;
desc->features = opt.constraints; desc->features = opt.constraints;
desc->gres = opt.gres; desc->gres = opt.gres;
......
...@@ -777,9 +777,9 @@ static void _slurm_rpc_allocate_resources(slurm_msg_t * msg) ...@@ -777,9 +777,9 @@ static void _slurm_rpc_allocate_resources(slurm_msg_t * msg)
} }
#if HAVE_CRAY #if HAVE_CRAY
/* /*
* We are using the alloc_sid as unique identifier to confirm the ALPS * Catch attempts to nest salloc sessions. It is not possible to use an
* reservation. ALPS will refuse any attempt to create a second session * ALPS session which has the same alloc_sid, it fails even if PAGG
* with the same identifier, hence sessions may not be nested. * container IDs are used.
*/ */
if (allocated_session_in_use(job_desc_msg)) { if (allocated_session_in_use(job_desc_msg)) {
error_code = ESLURM_RESERVATION_BUSY; error_code = ESLURM_RESERVATION_BUSY;
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment