diff --git a/doc/html/job_container_plugins.shtml b/doc/html/job_container_plugins.shtml index 20cb9cf60ff1c84982063de406dc1040cf950898..e47ca39176c5e50cf796fe98d009995c8aa4b96c 100644 --- a/doc/html/job_container_plugins.shtml +++ b/doc/html/job_container_plugins.shtml @@ -109,6 +109,10 @@ Job ID.</p> the plugin should return Slurm_ERROR and set the errno to an appropriate value to indicate the reason for failure.</p> +<p class="commandline">void container_p_reconfig (void);</p> +<p style="margin-left:.2in"><b>Description</b>: Note change in configuration, +especially the value of the DebugFlags with respect to JobContainer.</p> + <h2>Versioning</h2> <p> This document describes version 101 of the Slurm job container API. Future releases of Slurm may revise this API. A job container plugin diff --git a/doc/man/man5/slurm.conf.5 b/doc/man/man5/slurm.conf.5 index 3d3dad173425c6db6e6466525bb7fc3ca21a3511..ed87657b534aa9a1e8324fad85075b4591587402 100644 --- a/doc/man/man5/slurm.conf.5 +++ b/doc/man/man5/slurm.conf.5 @@ -445,6 +445,9 @@ Generic resource details \fBGang\fR Gang scheduling details .TP +\fBJobContainer\fR +Job container plugin details +.TP \fBNO_CONF_HASH\fR Do not log when the slurm.conf files differs between SLURM daemons .TP diff --git a/slurm/slurm.h.in b/slurm/slurm.h.in index 7722ace52700362f5cd9e36fe663cbe3d552159d..4b81f7f6efad7b1f4e9f1588dec83d857756189c 100644 --- a/slurm/slurm.h.in +++ b/slurm/slurm.h.in @@ -1935,6 +1935,7 @@ typedef struct reservation_name_msg { #define DEBUG_FLAG_PROFILE 0x00200000 /* AcctGatherProfile plugin */ #define DEBUG_FLAG_INFINIBAND 0x00400000 /* AcctGatherInfiniband plugin */ #define DEBUG_FLAG_FILESYSTEM 0x00800000 /* AcctGatherFilesystem plugin */ +#define DEBUG_FLAG_JOB_CONT 0x01000000 /* JobContainer plugin */ #define GROUP_FORCE 0x8000 /* if set, update group membership * info even if no updates to diff --git a/src/common/read_config.c b/src/common/read_config.c index 
13ce0690f711eae64651604141a621b14c87ebdc..815f6f56702f73be09b3ea21d441c10aa5630232 100644 --- a/src/common/read_config.c +++ b/src/common/read_config.c @@ -3840,6 +3840,11 @@ extern char * debug_flags2str(uint32_t debug_flags) xstrcat(rc, ","); xstrcat(rc, "Filesystem"); } + if (debug_flags & DEBUG_FLAG_JOB_CONT) { + if (rc) + xstrcat(rc, ","); + xstrcat(rc, "JobContainer"); + } if (debug_flags & DEBUG_FLAG_NO_CONF_HASH) { if (rc) xstrcat(rc, ","); @@ -3940,6 +3945,8 @@ extern uint32_t debug_str2flags(char *debug_flags) rc |= DEBUG_FLAG_INFINIBAND; else if (strcasecmp(tok, "Filesystem") == 0) rc |= DEBUG_FLAG_FILESYSTEM; + else if (strcasecmp(tok, "JobContainer") == 0) + rc |= DEBUG_FLAG_JOB_CONT; else if (strcasecmp(tok, "NO_CONF_HASH") == 0) rc |= DEBUG_FLAG_NO_CONF_HASH; else if (strcasecmp(tok, "NoRealTime") == 0) diff --git a/src/common/slurm_xlator.h b/src/common/slurm_xlator.h index 8832724f056ce1ceda95f24896d99fba4b9cf26d..962cc48990f658cd3402a5401f0d5f583a335408 100644 --- a/src/common/slurm_xlator.h +++ b/src/common/slurm_xlator.h @@ -377,6 +377,7 @@ #include "src/common/node_select.h" #include "src/common/pack.h" #include "src/common/parse_config.h" +#include "src/common/read_config.h" #include "src/common/env.h" #include "src/common/slurm_auth.h" #include "src/common/strlcpy.h" diff --git a/src/plugins/job_container/cncu/job_container_cncu.c b/src/plugins/job_container/cncu/job_container_cncu.c index 9b16b032a9ecc889fa041b46042a35ae89fbf9cc..8df2aa94b11db0394473303dca0e3cfededffce9 100644 --- a/src/plugins/job_container/cncu/job_container_cncu.c +++ b/src/plugins/job_container/cncu/job_container_cncu.c @@ -43,18 +43,18 @@ #include <sys/stat.h> #include <fcntl.h> -#include <job.h> /* Cray's job module component */ +//#include <job.h> /* Cray's job module component */ #include "slurm/slurm_errno.h" #include "src/common/slurm_xlator.h" #include "src/slurmd/common/proctrack.h" -#define _DEBUG 0 - #define ADD_FLAGS 0 #define CREATE_FLAGS 0 #define 
DELETE_FLAGS 0 +#define JOB_BUF_SIZE 128 + /* * These variables are required by the generic plugin interface. If they * are not found in the plugin, the plugin loader will ignore it. @@ -86,12 +86,11 @@ const char plugin_name[] = "job_container cncu plugin"; const char plugin_type[] = "job_container/cncu"; const uint32_t plugin_version = 101; -#define JOB_BUF_SIZE 128 - static uint32_t *job_id_array = NULL; static uint32_t job_id_count = 0; static pthread_mutex_t context_lock = PTHREAD_MUTEX_INITIALIZER; static char *state_dir = NULL; +static bool enable_debug = false; static int _save_state(char *dir_name) { @@ -184,7 +183,6 @@ static int _restore_state(char *dir_name) return error_code; } -#if _DEBUG static void _stat_reservation(char *type, rid_t resv_id) { struct job_resv_stat buf; @@ -198,7 +196,24 @@ static void _stat_reservation(char *type, rid_t resv_id) buf.num_files, buf.num_ipc_objs); } } -#endif + +static bool _get_debug_flag(void) +{ + if (slurm_get_debug_flags() & DEBUG_FLAG_JOB_CONT) + return true; + return false; +} + +extern void container_p_reconfig(void) +{ + bool new_debug_flag = _get_debug_flag(); + + if (enable_debug != new_debug_flag) { + debug("%s: JobContainer DebugFlag changed to %d", + plugin_name, (int) new_debug_flag); + } + enable_debug = new_debug_flag; +} /* * init() is called when the plugin is loaded, before any other functions @@ -206,11 +221,12 @@ static void _stat_reservation(char *type, rid_t resv_id) */ extern int init(void) { -#if _DEBUG - info("%s loaded", plugin_name); -#else - debug("%s loaded", plugin_name); -#endif + enable_debug = _get_debug_flag(); + if (enable_debug) + info("%s loaded", plugin_name); + else + debug("%s loaded", plugin_name); + return SLURM_SUCCESS; } @@ -255,8 +271,8 @@ extern int container_p_create(uint32_t job_id) int rc; int i, empty = -1, found = -1; bool job_id_change = false; - info("%s: creating(%u)", plugin_type, job_id); + info("%s: creating(%u)", plugin_type, job_id); 
slurm_mutex_lock(&context_lock); for (i = 0; i < job_id_count; i++) { if (job_id_array[i] == 0) { @@ -288,9 +304,8 @@ extern int container_p_create(uint32_t job_id) error("%s: create(%u): Reservation already exists", plugin_type, job_id); } -#if _DEBUG - _stat_reservation("create", resv_id); -#endif + if (enable_debug) + _stat_reservation("create", resv_id); return SLURM_SUCCESS; } error("%s: create(%u): %m", plugin_type, job_id); @@ -304,9 +319,11 @@ extern int container_p_add_cont(uint32_t job_id, uint64_t cont_id) rid_t resv_id = job_id; int rc; -#if _DEBUG - info("%s: adding cont(%u.%"PRIu64")", plugin_type, job_id, cont_id); -#endif + if (enable_debug) { + info("%s: adding cont(%u.%"PRIu64")", + plugin_type, job_id, cont_id); + } + rc = job_attach_reservation(cjob_id, resv_id, ADD_FLAGS); if ((rc != 0) && (errno == ENOENT)) { /* Log and retry */ error("%s: add(%u.%"PRIu64"): No reservation found", @@ -315,9 +332,8 @@ extern int container_p_add_cont(uint32_t job_id, uint64_t cont_id) rc = job_attach_reservation(cjob_id, resv_id, ADD_FLAGS); } if (rc == 0) { -#if _DEBUG - _stat_reservation("add", resv_id); -#endif + if (enable_debug) + _stat_reservation("add", resv_id); return SLURM_SUCCESS; } error("%s: add(%u.%"PRIu64"): %m", plugin_type, job_id, cont_id); @@ -329,9 +345,10 @@ extern int container_p_add_pid(uint32_t job_id, pid_t pid, uid_t uid) { stepd_step_rec_t job; -#if _DEBUG - info("%s: adding pid(%u.%u)", plugin_type, job_id, (uint32_t) pid); -#endif + if (enable_debug) { + info("%s: adding pid(%u.%u)", + plugin_type, job_id, (uint32_t) pid); + } memset(&job, 0, sizeof(stepd_step_rec_t)); job.jmgr_pid = pid; job.uid = uid; diff --git a/src/plugins/job_container/none/job_container_none.c b/src/plugins/job_container/none/job_container_none.c index 17306eeacb77546f8ff986a8f152667b92afd4f4..1a438febf6cea3ea4c38168c813c18c0accdd58d 100644 --- a/src/plugins/job_container/none/job_container_none.c +++ b/src/plugins/job_container/none/job_container_none.c @@ 
-46,7 +46,7 @@ #include "src/common/slurm_xlator.h" #include "src/slurmd/common/proctrack.h" -#define _DEBUG 0 +#define JOB_BUF_SIZE 128 /* * These variables are required by the generic plugin interface. If they @@ -80,9 +80,7 @@ const char plugin_type[] = "job_container/none"; const uint32_t plugin_version = 101; char *state_dir = NULL; /* state save directory */ - -#if _DEBUG -#define JOB_BUF_SIZE 128 +static bool enable_debug = false; static uint32_t *job_id_array = NULL; static uint32_t job_id_count = 0; @@ -178,7 +176,28 @@ static int _restore_state(char *dir_name) return error_code; } -#endif + +static bool _get_debug_flag(void) +{ + if (slurm_get_debug_flags() & DEBUG_FLAG_JOB_CONT) + return true; + return false; +} + +extern void container_p_reconfig(void) +{ + bool new_debug_flag = _get_debug_flag(); + + if (!enable_debug && new_debug_flag) { + error("%s: DebugFlag enabled by reconfiguration, this may " + "result in errors due to missing job cache information", + plugin_name); + } else if (enable_debug != new_debug_flag) { + debug("%s: JobContainer DebugFlag changed to %d", + plugin_name, (int) new_debug_flag); + } + enable_debug = new_debug_flag; +} /* * init() is called when the plugin is loaded, before any other functions @@ -186,11 +205,12 @@ static int _restore_state(char *dir_name) */ extern int init(void) { -#if _DEBUG - info("%s loaded", plugin_name); -#else - debug("%s loaded", plugin_name); -#endif + enable_debug = _get_debug_flag(); + if (enable_debug) + info("%s loaded", plugin_name); + else + debug("%s loaded", plugin_name); + return SLURM_SUCCESS; } @@ -206,25 +226,26 @@ extern int fini(void) extern int container_p_restore(char *dir_name, bool recover) { -#if _DEBUG - int i; - - slurm_mutex_lock(&context_lock); - _restore_state(dir_name); - slurm_mutex_unlock(&context_lock); - for (i = 0; i < job_id_count; i++) { - if (job_id_array[i] == 0) - continue; - if (recover) { - info("%s: recovered job(%u)", - plugin_type, job_id_array[i]); - } else 
{ - info("%s: purging job(%u)", - plugin_type, job_id_array[i]); - job_id_array[i] = 0; + if (enable_debug) { + int i; + + slurm_mutex_lock(&context_lock); + _restore_state(dir_name); + slurm_mutex_unlock(&context_lock); + for (i = 0; i < job_id_count; i++) { + if (job_id_array[i] == 0) + continue; + if (recover) { + info("%s: recovered job(%u)", + plugin_type, job_id_array[i]); + } else { + info("%s: purging job(%u)", + plugin_type, job_id_array[i]); + job_id_array[i] = 0; + } } } -#endif + xfree(state_dir); state_dir = xstrdup(dir_name); return SLURM_SUCCESS; @@ -232,90 +253,97 @@ extern int container_p_restore(char *dir_name, bool recover) extern int container_p_create(uint32_t job_id) { -#if _DEBUG - int i, empty = -1, found = -1; - bool job_id_change = false; - info("%s: creating(%u)", plugin_type, job_id); - - slurm_mutex_lock(&context_lock); - for (i = 0; i < job_id_count; i++) { - if (job_id_array[i] == 0) { - empty = i; - } else if (job_id_array[i] == job_id) { - found = i; - break; + if (enable_debug) { + int i, empty = -1, found = -1; + bool job_id_change = false; + + info("%s: creating(%u)", plugin_type, job_id); + slurm_mutex_lock(&context_lock); + for (i = 0; i < job_id_count; i++) { + if (job_id_array[i] == 0) { + empty = i; + } else if (job_id_array[i] == job_id) { + found = i; + break; + } } - } - if (found == -1) { - if (empty == -1) { - empty = job_id_count; - job_id_count += 4; - job_id_array = xrealloc(job_id_array, - sizeof(uint32_t)*job_id_count); + if (found == -1) { + if (empty == -1) { + empty = job_id_count; + job_id_count += 4; + job_id_array = xrealloc(job_id_array, + sizeof(uint32_t)*job_id_count); + } + job_id_array[empty] = job_id; + job_id_change = true; + } else { + info("%s: duplicate create job(%u)", plugin_type, job_id); } - job_id_array[empty] = job_id; - job_id_change = true; - } else { - info("%s: duplicate create job(%u)", plugin_type, job_id); + if (job_id_change) + _save_state(state_dir); + 
slurm_mutex_unlock(&context_lock); } - if (job_id_change) - _save_state(state_dir); - slurm_mutex_unlock(&context_lock); -#endif + return SLURM_SUCCESS; } /* Add proctrack container (PAGG) to a job container */ extern int container_p_add_cont(uint32_t job_id, uint64_t cont_id) { -#if _DEBUG - /* This is called from slurmstepd, so the job_id_array is NULL here. - * The array is only set by slurmstepd */ - info("%s: adding cont(%u.%"PRIu64")", plugin_type, job_id, cont_id); -#endif + if (enable_debug) { + /* This is called from slurmstepd, so the job_id_array is NULL + * here.The array is only set by slurmstepd */ + info("%s: adding cont(%u.%"PRIu64")", plugin_type, job_id, + cont_id); + } + return SLURM_SUCCESS; } /* Add a process to a job container, create the proctrack container to add */ extern int container_p_add_pid(uint32_t job_id, pid_t pid, uid_t uid) { -#if _DEBUG - stepd_step_rec_t job; + if (enable_debug) { + stepd_step_rec_t job; - info("%s: adding pid(%u.%u)", plugin_type, job_id, (uint32_t) pid); + info("%s: adding pid(%u.%u)", plugin_type, job_id, + (uint32_t) pid); - memset(&job, 0, sizeof(stepd_step_rec_t)); - job.jmgr_pid = pid; - job.uid = uid; - if (slurm_container_create(&job) != SLURM_SUCCESS) { - error("%s: slurm_container_create job(%u)", plugin_type,job_id); - return SLURM_ERROR; + memset(&job, 0, sizeof(stepd_step_rec_t)); + job.jmgr_pid = pid; + job.uid = uid; + if (slurm_container_create(&job) != SLURM_SUCCESS) { + error("%s: slurm_container_create job(%u)", + plugin_type, job_id); + return SLURM_ERROR; + } + return container_p_add_cont(job_id, job.cont_id); } - return container_p_add_cont(job_id, job.cont_id); -#endif + return SLURM_SUCCESS; } extern int container_p_delete(uint32_t job_id) { -#if _DEBUG - int i, found = -1; - bool job_id_change = false; - - info("%s: deleting(%u)", plugin_type, job_id); - slurm_mutex_lock(&context_lock); - for (i = 0; i < job_id_count; i++) { - if (job_id_array[i] == job_id) { - job_id_array[i] = 0; - 
job_id_change = true; - found = i; + if (enable_debug) { + int i, found = -1; + bool job_id_change = false; + + info("%s: deleting(%u)", plugin_type, job_id); + slurm_mutex_lock(&context_lock); + for (i = 0; i < job_id_count; i++) { + if (job_id_array[i] == job_id) { + job_id_array[i] = 0; + job_id_change = true; + found = i; + } } + if (found == -1) + info("%s: no job for delete(%u)", plugin_type, job_id); + if (job_id_change) + _save_state(state_dir); + slurm_mutex_unlock(&context_lock); } - if (found == -1) - info("%s: no job for delete(%u)", plugin_type, job_id); - if (job_id_change) - _save_state(state_dir); - slurm_mutex_unlock(&context_lock); -#endif + return SLURM_SUCCESS; } diff --git a/src/slurmd/common/job_container_plugin.c b/src/slurmd/common/job_container_plugin.c index 10b0b6dd08df1522c4c55646c97a1e8f584fa4d9..4922e643bba1eeaad10a9ecd497d1c9aed67dbba 100644 --- a/src/slurmd/common/job_container_plugin.c +++ b/src/slurmd/common/job_container_plugin.c @@ -52,6 +52,7 @@ typedef struct job_container_ops { int (*container_p_add_pid) (uint32_t job_id, pid_t pid, uid_t uid); int (*container_p_delete) (uint32_t job_id); int (*container_p_restore) (char *dir_name, bool recover); + void (*container_p_reconfig) (void); } job_container_ops_t; @@ -64,6 +65,7 @@ static const char *syms[] = { "container_p_add_pid", "container_p_delete", "container_p_restore", + "container_p_reconfig", }; static job_container_ops_t *ops = NULL; @@ -264,3 +266,20 @@ extern int container_g_restore(char * dir_name, bool recover) return rc; } + +/* Note change in configuration (e.g. 
"DebugFlags=JobContainer" set) */ +extern void container_g_reconfig(void) +{ + int i; + + (void) job_container_init(); + + slurm_mutex_lock(&g_container_context_lock); + for (i = 0; i < g_container_context_num;i++) { + (*(ops[i].container_p_reconfig))(); + } + slurm_mutex_unlock(&g_container_context_lock); + + return; +} + diff --git a/src/slurmd/common/job_container_plugin.h b/src/slurmd/common/job_container_plugin.h index 19e29c878dd0c11029c790f3069feec0cafd0fed..c9f7b243be05bef798130f52474c6b6c0ec04274 100644 --- a/src/slurmd/common/job_container_plugin.h +++ b/src/slurmd/common/job_container_plugin.h @@ -77,4 +77,7 @@ extern int container_g_delete(uint32_t job_id); /* Restore container information */ extern int container_g_restore(char * dir_name, bool recover); +/* Note change in configuration (e.g. "DebugFlags=JobContainer" set) */ +extern void container_g_reconfig(void); + #endif /* _JOB_CONTAINER_PLUGIN_H_ */ diff --git a/src/slurmd/slurmd/slurmd.c b/src/slurmd/slurmd/slurmd.c index eb3656ae416fbdcf9c7ce02caaea567cdb6139eb..289be793d1948a43ef0c81c1d02ab9e2297b14cd 100644 --- a/src/slurmd/slurmd/slurmd.c +++ b/src/slurmd/slurmd/slurmd.c @@ -996,6 +996,7 @@ _reconfigure(void) gres_plugin_reconfig(&did_change); (void) switch_g_reconfig(); + container_g_reconfig(); if (did_change) { uint32_t cpu_cnt = MAX(conf->conf_cpus, conf->block_map_size); (void) gres_plugin_node_config_load(cpu_cnt);