Skip to content
Snippets Groups Projects
Commit ac21f730 authored by Morris Jette
Browse files

select/cray: use pagg ID to confirm reservations

This uses the SGI process aggregate (pagg) container ID to confirm the job's ALPS reservation.

If the job has no pagg ID (the stored value is 0), it falls back to using the
alloc_sid. This fallback should be treated strictly as a last resort: session
IDs are known not to be unique across multiple login nodes, so the confirmation
of ALPS reservations will fail whenever there is a SID collision (and the
likelihood of a collision increases with system size).
parent 00b792bd
No related branches found
No related tags found
No related merge requests found
...@@ -716,6 +716,7 @@ extern int do_basil_reserve(struct job_record *job_ptr) ...@@ -716,6 +716,7 @@ extern int do_basil_reserve(struct job_record *job_ptr)
extern int do_basil_confirm(struct job_record *job_ptr) extern int do_basil_confirm(struct job_record *job_ptr)
{ {
uint32_t resv_id; uint32_t resv_id;
uint64_t pagg_id;
if (_get_select_jobinfo(job_ptr->select_jobinfo->data, if (_get_select_jobinfo(job_ptr->select_jobinfo->data,
SELECT_JOBDATA_RESV_ID, &resv_id) != SLURM_SUCCESS) { SELECT_JOBDATA_RESV_ID, &resv_id) != SLURM_SUCCESS) {
...@@ -723,20 +724,32 @@ extern int do_basil_confirm(struct job_record *job_ptr) ...@@ -723,20 +724,32 @@ extern int do_basil_confirm(struct job_record *job_ptr)
} else if (resv_id == 0) { } else if (resv_id == 0) {
/* On Cray XT/XE, a reservation ID of 0 is always invalid. */ /* On Cray XT/XE, a reservation ID of 0 is always invalid. */
error("JobId=%u has invalid (ZERO) resId", job_ptr->job_id); error("JobId=%u has invalid (ZERO) resId", job_ptr->job_id);
} else if (_get_select_jobinfo(job_ptr->select_jobinfo->data,
SELECT_JOBDATA_PAGG_ID, &pagg_id) != SLURM_SUCCESS) {
error("can not read pagg ID for JobId=%u", job_ptr->job_id);
} else { } else {
/* basil_confirm logs the error and rc-encodes the error type */ int rc;
int rc = basil_confirm(resv_id, job_ptr->job_id,
job_ptr->alloc_sid); if (pagg_id == 0) {
/* This fallback case is for interactive jobs only */
error("JobId %u has no pagg ID, falling back to SID",
job_ptr->job_id);
pagg_id = job_ptr->alloc_sid;
}
rc = basil_confirm(resv_id, job_ptr->job_id, pagg_id);
if (rc == 0) { if (rc == 0) {
debug2("confirmed ALPS resId %u for JobId %u, pagg %u", debug2("confirmed ALPS resId %u for JobId %u, "
resv_id, job_ptr->job_id, job_ptr->alloc_sid); "pagg %"PRIu64"",
resv_id, job_ptr->job_id, pagg_id);
return SLURM_SUCCESS; return SLURM_SUCCESS;
} } else {
error("confirming ALPS resId %u, pagg %u FAILED with %d", error("confirming ALPS resId %u of JobId %u FAILED: %s",
resv_id, job_ptr->alloc_sid, rc); resv_id, job_ptr->job_id, basil_strerror(rc));
if (is_transient_error(rc)) if (is_transient_error(rc))
return READY_JOB_ERROR; return READY_JOB_ERROR;
}
} }
return READY_JOB_FATAL; return READY_JOB_FATAL;
} }
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment