Skip to content
Snippets Groups Projects
Commit c2395acb authored by Danny Auble's avatar Danny Auble
Browse files

Fix issue where a job that needs memory limit enforcement could run
with a user-specified polling frequency larger than the default
(meaning the enforcement would happen at a possibly much slower pace).
Such a job is now denied.
parent c4848957
No related branches found
No related tags found
No related merge requests found
...@@ -724,6 +724,8 @@ _read_config(void) ...@@ -724,6 +724,8 @@ _read_config(void)
{ {
char *path_pubkey = NULL; char *path_pubkey = NULL;
slurm_ctl_conf_t *cf = NULL; slurm_ctl_conf_t *cf = NULL;
uint16_t tmp16 = 0;
#ifndef HAVE_FRONT_END #ifndef HAVE_FRONT_END
bool cr_flag = false, gang_flag = false; bool cr_flag = false, gang_flag = false;
#endif #endif
...@@ -890,6 +892,13 @@ _read_config(void) ...@@ -890,6 +892,13 @@ _read_config(void)
_free_and_set(&conf->job_acct_gather_freq, _free_and_set(&conf->job_acct_gather_freq,
xstrdup(cf->job_acct_gather_freq)); xstrdup(cf->job_acct_gather_freq));
conf->acct_freq_task = (uint16_t)NO_VAL;
tmp16 = acct_gather_parse_freq(PROFILE_TASK,
conf->job_acct_gather_freq);
if (tmp16 != -1)
conf->acct_freq_task = tmp16;
_free_and_set(&conf->acct_gather_energy_type, _free_and_set(&conf->acct_gather_energy_type,
xstrdup(cf->acct_gather_energy_type)); xstrdup(cf->acct_gather_energy_type));
_free_and_set(&conf->acct_gather_filesystem_type, _free_and_set(&conf->acct_gather_filesystem_type,
......
...@@ -133,6 +133,7 @@ typedef struct slurmd_config { ...@@ -133,6 +133,7 @@ typedef struct slurmd_config {
uint16_t slurmd_timeout; /* SlurmdTimeout */ uint16_t slurmd_timeout; /* SlurmdTimeout */
uid_t slurm_user_id; /* UID that slurmctld runs as */ uid_t slurm_user_id; /* UID that slurmctld runs as */
pthread_mutex_t config_mutex; /* lock for slurmd_config access */ pthread_mutex_t config_mutex; /* lock for slurmd_config access */
uint16_t acct_freq_task;
char *job_acct_gather_freq; char *job_acct_gather_freq;
char *job_acct_gather_type; /* job accounting gather type */ char *job_acct_gather_type; /* job accounting gather type */
char *acct_gather_energy_type; /* */ char *acct_gather_energy_type; /* */
......
...@@ -50,6 +50,7 @@ ...@@ -50,6 +50,7 @@
#include "src/common/cpu_frequency.h" #include "src/common/cpu_frequency.h"
#include "src/common/gres.h" #include "src/common/gres.h"
#include "src/common/slurm_jobacct_gather.h" #include "src/common/slurm_jobacct_gather.h"
#include "src/common/slurm_acct_gather_profile.h"
#include "src/common/slurm_rlimits_info.h" #include "src/common/slurm_rlimits_info.h"
#include "src/common/stepd_api.h" #include "src/common/stepd_api.h"
#include "src/common/switch.h" #include "src/common/switch.h"
...@@ -187,6 +188,7 @@ static slurmd_conf_t * read_slurmd_conf_lite (int fd) ...@@ -187,6 +188,7 @@ static slurmd_conf_t * read_slurmd_conf_lite (int fd)
int len; int len;
Buf buffer; Buf buffer;
slurmd_conf_t *confl; slurmd_conf_t *confl;
int tmp_int = 0;
/* First check to see if we've already initialized the /* First check to see if we've already initialized the
* global slurmd_conf_t in 'conf'. Allocate memory if not. * global slurmd_conf_t in 'conf'. Allocate memory if not.
...@@ -221,6 +223,13 @@ static slurmd_conf_t * read_slurmd_conf_lite (int fd) ...@@ -221,6 +223,13 @@ static slurmd_conf_t * read_slurmd_conf_lite (int fd)
} else } else
confl->log_opts.syslog_level = LOG_LEVEL_QUIET; confl->log_opts.syslog_level = LOG_LEVEL_QUIET;
confl->acct_freq_task = (uint16_t)NO_VAL;
tmp_int = acct_gather_parse_freq(PROFILE_TASK,
confl->job_acct_gather_freq);
if (tmp_int != -1)
confl->acct_freq_task = tmp_int;
return (confl); return (confl);
rwfail: rwfail:
return (NULL); return (NULL);
......
...@@ -76,6 +76,30 @@ static void _job_init_task_info(slurmd_job_t *job, uint32_t *gtid, ...@@ -76,6 +76,30 @@ static void _job_init_task_info(slurmd_job_t *job, uint32_t *gtid,
char *ifname, char *ofname, char *efname); char *ifname, char *ofname, char *efname);
static void _task_info_destroy(slurmd_task_info_t *t, uint16_t multi_prog); static void _task_info_destroy(slurmd_task_info_t *t, uint16_t multi_prog);
static int _check_acct_freq_task(uint32_t job_mem_lim, char *acctg_freq)
{
int task_freq;
if (!job_mem_lim || !conf->acct_freq_task)
return 0;
task_freq = acct_gather_parse_freq(PROFILE_TASK, acctg_freq);
if (task_freq == -1)
return 0;
if ((task_freq == 0) || (task_freq > conf->acct_freq_task)) {
error("Can't set frequency to %u, it is higher than %u. "
"We need it to be at least at this level to "
"monitor memory usage.",
task_freq, conf->acct_freq_task);
slurm_seterrno (ESLURMD_INVALID_ACCT_FREQ);
return 1;
}
return 0;
}
static struct passwd * static struct passwd *
_pwd_create(uid_t uid) _pwd_create(uid_t uid)
{ {
...@@ -180,17 +204,10 @@ job_create(launch_tasks_request_msg_t *msg) ...@@ -180,17 +204,10 @@ job_create(launch_tasks_request_msg_t *msg)
return NULL; return NULL;
} }
/* FIXME: handle this now that acctg_freq is a string */ if (_check_acct_freq_task(msg->job_mem_lim, msg->acctg_freq)) {
/* if (msg->job_mem_lim && (msg->acctg_freq != (uint16_t) NO_VAL) */ _pwd_destroy(pwd);
/* && (msg->acctg_freq > conf->job_acct_gather_freq)) { */ return NULL;
/* error("Can't set frequency to %u, it is higher than %u. " */ }
/* "We need it to be at least at this level to " */
/* "monitor memory usage.", */
/* msg->acctg_freq, conf->job_acct_gather_freq); */
/* slurm_seterrno (ESLURMD_INVALID_ACCT_FREQ); */
/* _pwd_destroy(pwd); */
/* return NULL; */
/* } */
job = xmalloc(sizeof(slurmd_job_t)); job = xmalloc(sizeof(slurmd_job_t));
#ifndef HAVE_FRONT_END #ifndef HAVE_FRONT_END
...@@ -381,17 +398,10 @@ job_batch_job_create(batch_job_launch_msg_t *msg) ...@@ -381,17 +398,10 @@ job_batch_job_create(batch_job_launch_msg_t *msg)
return NULL; return NULL;
} }
/* FIXME: handle this now that acctg_freq is a string */ if (_check_acct_freq_task(msg->job_mem, msg->acctg_freq)) {
/* if (msg->job_mem_lim && (msg->acctg_freq != (uint16_t) NO_VAL) */ _pwd_destroy(pwd);
/* && (msg->acctg_freq > conf->job_acct_gather_freq)) { */ return NULL;
/* error("Can't set frequency to %u, it is higher than %u. " */ }
/* "We need it to be at least at this level to " */
/* "monitor memory usage.", */
/* msg->acctg_freq, conf->job_acct_gather_freq); */
/* slurm_seterrno (ESLURMD_INVALID_ACCT_FREQ); */
/* _pwd_destroy(pwd); */
/* return NULL; */
/* } */
job = xmalloc(sizeof(slurmd_job_t)); job = xmalloc(sizeof(slurmd_job_t));
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment