diff --git a/src/common/slurm_protocol_defs.c b/src/common/slurm_protocol_defs.c
index 91f3ec1d0249f0fa32b042fe34d19534e89a201c..2f8acf872968c435636d2452d21364e1f5ddc2c2 100644
--- a/src/common/slurm_protocol_defs.c
+++ b/src/common/slurm_protocol_defs.c
@@ -268,7 +268,10 @@ void slurm_free_launch_tasks_response_msg(launch_tasks_response_msg_t *
 
 void slurm_free_kill_job_msg(kill_job_msg_t * msg)
 {
-	xfree(msg);
+	if (msg) {
+		select_g_free_jobinfo(&msg->select_jobinfo);
+		xfree(msg);
+	}
 }
 
 void slurm_free_update_job_time_msg(job_time_msg_t * msg)
diff --git a/src/common/slurm_protocol_defs.h b/src/common/slurm_protocol_defs.h
index 805b903b51561a9c922853ecaeba74f83132d805..480ceb211cdb7520f26be1daf5e30927bc355992 100644
--- a/src/common/slurm_protocol_defs.h
+++ b/src/common/slurm_protocol_defs.h
@@ -316,9 +316,14 @@ typedef struct return_code_msg {
 	int32_t return_code;
 } return_code_msg_t;
 
+/* Note: We include select_jobinfo here in addition to the job launch
+ * RPC in order to ensure reliable clean-up of a BlueGene partition in
+ * the event of some launch failure or race condition preventing slurmd
+ * from getting the BGL_PARTITION_ID at that time */
 typedef struct kill_job_msg {
 	uint32_t job_id;
 	uint32_t job_uid;
+	select_jobinfo_t select_jobinfo;	/* opaque data type */
 } kill_job_msg_t;
 
 typedef struct job_time_msg {
diff --git a/src/common/slurm_protocol_pack.c b/src/common/slurm_protocol_pack.c
index c2d47a1d2dc049b83626656f5ebb0976cd947fad..7da0911cbe80940482b2c41c14147fc52570227e 100644
--- a/src/common/slurm_protocol_pack.c
+++ b/src/common/slurm_protocol_pack.c
@@ -1267,6 +1267,7 @@ _pack_kill_job_msg(kill_job_msg_t * msg, Buf buffer)
 
 	pack32(msg->job_id, buffer);
 	pack32(msg->job_uid, buffer);
+	select_g_pack_jobinfo(msg->select_jobinfo, buffer);
 }
 
 static int
@@ -1282,6 +1283,10 @@ _unpack_kill_job_msg(kill_job_msg_t ** msg, Buf buffer)
 
 	safe_unpack32(&(tmp_ptr->job_id), buffer);
 	safe_unpack32(&(tmp_ptr->job_uid), buffer);
+	if (select_g_alloc_jobinfo (&tmp_ptr->select_jobinfo)
+	    || select_g_unpack_jobinfo(tmp_ptr->select_jobinfo, buffer))
+		goto unpack_error;
+
 	return SLURM_SUCCESS;
 
 unpack_error:
diff --git a/src/slurmctld/controller.c b/src/slurmctld/controller.c
index 1e06fa74dcfe2670cfd1d99e7ce71ed19df02e97..6c4e1b2a663f3eca7ab8e3ff591182b90e6a1480 100644
--- a/src/slurmctld/controller.c
+++ b/src/slurmctld/controller.c
@@ -3,7 +3,7 @@
 *****************************************************************************
 *  Copyright (C) 2002 The Regents of the University of California.
 *  Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
- *  Written by Morris Jette <jette@llnl.gov>, Kevin Tew <tew1@llnl.gov>, et. al.
+ *  Written by Morris Jette <jette1@llnl.gov>, Kevin Tew <tew1@llnl.gov>
 *  UCRL-CODE-2002-040.
 *
 *  This file is part of SLURM, a resource management program.
@@ -345,9 +345,9 @@ int main(int argc, char *argv[])
 
 	/* Plugins are needed to purge job/node data structures,
 	 * unplug after other data structures are purged */
-	slurm_select_fini();
 	g_slurm_jobcomp_fini();
 	slurm_sched_fini();
+	slurm_select_fini();
 	checkpoint_fini();
 	slurm_auth_fini();
 	switch_fini();
diff --git a/src/slurmctld/node_scheduler.c b/src/slurmctld/node_scheduler.c
index 3c1631dfc7dc099471b2064b532f6038b2b71976..4d594ec68a0c9528d7738373e1377562132a0389 100644
--- a/src/slurmctld/node_scheduler.c
+++ b/src/slurmctld/node_scheduler.c
@@ -168,6 +168,8 @@ extern void deallocate_nodes(struct job_record *job_ptr, bool timeout)
 	last_node_update = time(NULL);
 	kill_job->job_id = job_ptr->job_id;
 	kill_job->job_uid = job_ptr->user_id;
+	kill_job->select_jobinfo = select_g_copy_jobinfo(
+			job_ptr->select_jobinfo);
 
 	for (i = 0; i < node_record_count; i++) {
 		struct node_record *node_ptr = &node_record_table_ptr[i];
@@ -1149,6 +1151,8 @@ extern void re_kill_job(struct job_record *job_ptr)
 	kill_job = xmalloc(sizeof(kill_job_msg_t));
 	kill_job->job_id = job_ptr->job_id;
 	kill_job->job_uid = job_ptr->user_id;
+	kill_job->select_jobinfo = select_g_copy_jobinfo(
+			job_ptr->select_jobinfo);
 
 	for (i = 0; i < node_record_count; i++) {
 		struct node_record *node_ptr = &node_record_table_ptr[i];
diff --git a/src/slurmd/mgr.c b/src/slurmd/mgr.c
index f995bb94cbd1c3769919b55a70e31dab4c41c941..b527092b2329bd9dc411be0cc5f7772eeb8b2546 100644
--- a/src/slurmd/mgr.c
+++ b/src/slurmd/mgr.c
@@ -256,12 +256,14 @@ mgr_spawn_task(spawn_task_request_msg_t *msg, slurm_addr *cli,
 }
 
 /*
- * Run a prolog or epilog script.
- * returns -1 on failure.
- *
+ * Run a prolog or epilog script. Sets environment variables:
+ * SLURM_JOBID = jobid, SLURM_UID=uid, and
+ * BGL_PARTITION_ID=bgl_part_id (if not NULL)
+ * Returns -1 on failure.
 */
-int
-run_script(bool prolog, const char *path, uint32_t jobid, uid_t uid)
+extern int
+run_script(bool prolog, const char *path, uint32_t jobid, uid_t uid,
+	char *bgl_part_id)
 {
 	int status;
 	pid_t cpid;
@@ -293,6 +295,8 @@ run_script(bool prolog, const char *path, uint32_t jobid, uid_t uid)
 		env[0] = NULL;
 		setenvpf(&env, "SLURM_JOBID", "%u", jobid);
 		setenvpf(&env, "SLURM_UID", "%u", uid);
+		if (bgl_part_id)
+			setenvpf(&env, "BGL_PARTITION_ID", "%s", bgl_part_id);
 
 		execve(path, argv, env);
 		error("help! %m");
diff --git a/src/slurmd/mgr.h b/src/slurmd/mgr.h
index e012676b9d8659c814b80777370933b2bace4169..d7243b33bbc4b5166cb8e3455b9086e0ecd4e7a6 100644
--- a/src/slurmd/mgr.h
+++ b/src/slurmd/mgr.h
@@ -50,8 +50,12 @@ int mgr_launch_tasks(launch_tasks_request_msg_t *msg, slurm_addr *client,
 int mgr_launch_batch_job(batch_job_launch_msg_t *msg, slurm_addr *client);
 
 /*
- * Run epilog or prolog on this node
+ * Run a prolog or epilog script. Sets environment variables:
+ * SLURM_JOBID = jobid, SLURM_UID=uid, and
+ * BGL_PARTITION_ID=bgl_part_id (if not NULL)
+ * Returns -1 on failure.
 */
-int run_script(bool prolog, const char *path, uint32_t jobid, uid_t uid);
+extern int run_script(bool prolog, const char *path, uint32_t jobid, uid_t uid,
+		char *bgl_part_id);
 
 #endif
diff --git a/src/slurmd/req.c b/src/slurmd/req.c
index 27b790d9065723cde80c567951418cd314bc12ea..7857cc0b82942b06c0b4ab113db4dc3acabf996b 100644
--- a/src/slurmd/req.c
+++ b/src/slurmd/req.c
@@ -41,6 +41,7 @@
 #include "src/common/hostlist.h"
 #include "src/common/log.h"
 #include "src/common/macros.h"
+#include "src/common/node_select.h"
 #include "src/common/slurm_auth.h"
 #include "src/common/slurm_cred.h"
 #include "src/common/slurm_protocol_api.h"
@@ -77,8 +78,8 @@ static void _rpc_shutdown(slurm_msg_t *msg, slurm_addr *cli_addr);
 static void _rpc_reconfig(slurm_msg_t *msg, slurm_addr *cli_addr);
 static void _rpc_pid2jid(slurm_msg_t *msg, slurm_addr *);
 static int _rpc_ping(slurm_msg_t *, slurm_addr *);
-static int _run_prolog(uint32_t jobid, uid_t uid);
-static int _run_epilog(uint32_t jobid, uid_t uid);
+static int _run_prolog(uint32_t jobid, uid_t uid, char *bgl_part_id);
+static int _run_epilog(uint32_t jobid, uid_t uid, char *bgl_part_id);
 static int _spawn_task(spawn_task_request_msg_t *, slurm_addr *,
 		slurm_addr *);
 
@@ -395,7 +396,7 @@ _rpc_launch_tasks(slurm_msg_t *msg, slurm_addr *cli)
 /*	xassert(slurm_cred_jobid_cached(conf->vctx, req->job_id));*/
 
 	/* Run job prolog if necessary */
-	if (run_prolog && (_run_prolog(req->job_id, req->uid) != 0)) {
+	if (run_prolog && (_run_prolog(req->job_id, req->uid, NULL) != 0)) {
 		error("[job %u] prolog failed", req->job_id);
 		errnum = ESLURMD_PROLOG_FAILED;
 		goto done;
@@ -469,7 +470,7 @@ _rpc_spawn_task(slurm_msg_t *msg, slurm_addr *cli)
 /*	xassert(slurm_cred_jobid_cached(conf->vctx, req->job_id));*/
 
 	/* Run job prolog if necessary */
-	if (run_prolog && (_run_prolog(req->job_id, req->uid) != 0)) {
+	if (run_prolog && (_run_prolog(req->job_id, req->uid, NULL) != 0)) {
 		error("[job %u] prolog failed", req->job_id);
 		errnum = ESLURMD_PROLOG_FAILED;
 		goto done;
@@ -505,6 +506,7 @@ _rpc_batch_job(slurm_msg_t *msg, slurm_addr *cli)
 	batch_job_launch_msg_t *req = (batch_job_launch_msg_t *)msg->data;
 	int rc = SLURM_SUCCESS;
 	uid_t req_uid = g_slurm_auth_get_uid(msg->cred);
+	char *bgl_part_id = NULL;
 
 	if (!_slurm_authorized_user(req_uid)) {
 		error("Security violation, batch launch RPC from uid %u",
@@ -516,7 +518,11 @@
 	/*
 	 * Run job prolog on this node
 	 */
-	if (_run_prolog(req->job_id, req->uid) != 0) {
+	select_g_get_jobinfo(req->select_jobinfo, SELECT_DATA_PART_ID,
+			&bgl_part_id);
+	rc = _run_prolog(req->job_id, req->uid, bgl_part_id);
+	xfree(bgl_part_id);
+	if (rc != 0) {
 		error("[job %u] prolog failed", req->job_id);
 		rc = ESLURMD_PROLOG_FAILED;
 		goto done;
@@ -966,6 +972,7 @@ _rpc_kill_job(slurm_msg_t *msg, slurm_addr *cli)
 	uid_t uid = g_slurm_auth_get_uid(msg->cred);
 	int nsteps = 0;
 	int delay;
+	char *bgl_part_id = NULL;
 
 	/*
 	 * check that requesting user ID is the SLURM UID
@@ -1055,8 +1062,11 @@
 	}
 
 	save_cred_state(conf->vctx);
-
-	if (_run_epilog(req->job_id, req->job_uid) != 0) {
+	select_g_get_jobinfo(req->select_jobinfo, SELECT_DATA_PART_ID,
+			&bgl_part_id);
+	rc = _run_epilog(req->job_id, req->job_uid, bgl_part_id);
+	xfree(bgl_part_id);
+	if (rc != 0) {
 		error ("[job %u] epilog failed", req->job_id);
 		rc = ESLURMD_EPILOG_FAILED;
 	} else
@@ -1169,23 +1179,23 @@ _rpc_update_time(slurm_msg_t *msg, slurm_addr *cli)
 }
 
 static int
-_run_prolog(uint32_t jobid, uid_t uid)
+_run_prolog(uint32_t jobid, uid_t uid, char *bgl_part_id)
 {
 	int error_code;
 
 	slurm_mutex_lock(&conf->config_mutex);
-	error_code = run_script(true, conf->prolog, jobid, uid);
+	error_code = run_script(true, conf->prolog, jobid, uid, bgl_part_id);
 	slurm_mutex_unlock(&conf->config_mutex);
 	return error_code;
 }
 
 static int
-_run_epilog(uint32_t jobid, uid_t uid)
+_run_epilog(uint32_t jobid, uid_t uid, char *bgl_part_id)
 {
 	int error_code;
 
 	slurm_mutex_lock(&conf->config_mutex);
-	error_code = run_script(false, conf->epilog, jobid, uid);
+	error_code = run_script(false, conf->epilog, jobid, uid, bgl_part_id);
 	slurm_mutex_unlock(&conf->config_mutex);
 	return error_code;
 }
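
For reference, below is a minimal sketch of a prolog/epilog-side consumer of the environment that run_script() now exports. This standalone program is hypothetical and is not part of the patch or of SLURM itself; it assumes only what the patch actually sets: SLURM_JOBID and SLURM_UID are always present, and BGL_PARTITION_ID is present only when slurmd obtained a bgl_part_id from the job's select plugin data.

/*
 * Hypothetical epilog-side consumer (illustration only, not SLURM code).
 * It relies only on the environment run_script() exports per this patch:
 * SLURM_JOBID and SLURM_UID are always set; BGL_PARTITION_ID is set
 * only when a bgl_part_id was supplied.
 */
#include <stdio.h>
#include <stdlib.h>

int main(void)
{
	const char *jobid    = getenv("SLURM_JOBID");
	const char *uid      = getenv("SLURM_UID");
	const char *bgl_part = getenv("BGL_PARTITION_ID");

	if (!jobid || !uid) {
		fprintf(stderr, "not invoked via slurmd run_script()?\n");
		return 1;
	}

	printf("epilog: job %s (uid %s)\n", jobid, uid);
	if (bgl_part)
		printf("epilog: release BlueGene partition %s\n", bgl_part);
	else
		printf("epilog: no BGL_PARTITION_ID; nothing BlueGene-specific to do\n");
	return 0;
}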