diff --git a/src/common/slurm_acct_gather_profile.c b/src/common/slurm_acct_gather_profile.c index da3b5e04b4318d25ed1ac32bde64d5e58a3dac5e..f2e835dd0e55ff3b8ff4c784a1f9283c5ee71ec9 100644 --- a/src/common/slurm_acct_gather_profile.c +++ b/src/common/slurm_acct_gather_profile.c @@ -95,7 +95,11 @@ static bool init_run = false; static int _get_int(const char *my_str) { char *end = NULL; - int value = strtol(my_str, &end, 10); + int value; + + if (!my_str) + return -1; + value = strtol(my_str, &end, 10); //info("from %s I get %d and %s: %m", my_str, value, end); /* means no numbers */ if (my_str == end) diff --git a/src/plugins/acct_gather_energy/ipmi/acct_gather_energy_ipmi.c b/src/plugins/acct_gather_energy/ipmi/acct_gather_energy_ipmi.c index 007f3a9d250c9f2eeac63563266165a5fc2fc404..e02db83e4d7244adce46db3c6d0c3998822f0368 100644 --- a/src/plugins/acct_gather_energy/ipmi/acct_gather_energy_ipmi.c +++ b/src/plugins/acct_gather_energy/ipmi/acct_gather_energy_ipmi.c @@ -544,12 +544,16 @@ static int _update_profile_message() { ipmi_message_profile_t *tmp; int new_size; - slurm_ctl_conf_t *conf_ptr; if (profile_message_memory==0) { - conf_ptr = slurm_conf_lock(); + /* FIXME: now that job_acct_gather_freq is a string + this needs to be changed. This math looks wrong anyway. 
+ */ + /* new_size = 4 * */ + /* (2+ conf_ptr->job_acct_gather_freq /slurm_ipmi_conf.freq); */ new_size = 4 * - (2+ conf_ptr->job_acct_gather_freq /slurm_ipmi_conf.freq); + (2 + acct_gather_profile_timer[PROFILE_TASK].freq + / acct_gather_profile_timer[PROFILE_ENERGY].freq); slurm_conf_unlock(); tmp = (ipmi_message_profile_t *) xmalloc(sizeof(ipmi_message_profile_t)* new_size); diff --git a/src/plugins/job_submit/lua/job_submit_lua.c b/src/plugins/job_submit/lua/job_submit_lua.c index 4204cbb02e8160fcc534386780d61e30de269816..4a0ba6542d0358dd2b67a72317c20fd4a2e5fcf4 100644 --- a/src/plugins/job_submit/lua/job_submit_lua.c +++ b/src/plugins/job_submit/lua/job_submit_lua.c @@ -366,7 +366,7 @@ static int _get_job_req_field (lua_State *L) } else if (!strcmp(name, "account")) { lua_pushstring (L, job_desc->account); } else if (!strcmp(name, "acctg_freq")) { - lua_pushnumber (L, job_desc->acctg_freq); + lua_pushstring (L, job_desc->acctg_freq); } else if (!strcmp(name, "begin_time")) { lua_pushnumber (L, job_desc->begin_time); } else if (!strcmp(name, "comment")) { @@ -473,7 +473,10 @@ static int _set_job_req_field (lua_State *L) if (strlen(value_str)) job_desc->account = xstrdup(value_str); } else if (!strcmp(name, "acctg_freq")) { - job_desc->acctg_freq = luaL_checknumber(L, 3); + value_str = luaL_checkstring(L, 3); + xfree(job_desc->acctg_freq); + if (strlen(value_str)) + job_desc->acctg_freq = xstrdup(value_str); } else if (!strcmp(name, "begin_time")) { job_desc->begin_time = luaL_checknumber(L, 3); } else if (!strcmp(name, "comment")) { diff --git a/src/salloc/opt.c b/src/salloc/opt.c index 35b6ebef445f1193f5cbfd29d716289526f4669b..9682da6fad26e31afb310ca6b03f9a419465a723 100644 --- a/src/salloc/opt.c +++ b/src/salloc/opt.c @@ -356,7 +356,7 @@ static void _opt_default() opt.egid = (gid_t) -1; opt.bell = BELL_AFTER_DELAY; - opt.acctg_freq = -1; + opt.acctg_freq = NULL; opt.no_shell = false; opt.get_user_env_time = -1; opt.get_user_env_mode = -1; @@ -387,7 +387,7 @@ 
struct env_vars { env_vars_t env_vars[] = { {"SALLOC_ACCOUNT", OPT_STRING, &opt.account, NULL }, - {"SALLOC_ACCTG_FREQ", OPT_INT, &opt.acctg_freq, NULL }, + {"SALLOC_ACCTG_FREQ", OPT_STRING, &opt.acctg_freq, NULL }, {"SALLOC_BELL", OPT_BELL, NULL, NULL }, {"SALLOC_CONN_TYPE", OPT_CONN_TYPE, NULL, NULL }, {"SALLOC_CPU_BIND", OPT_CPU_BIND, NULL, NULL }, @@ -1104,7 +1104,8 @@ void set_options(const int argc, char **argv) opt.ramdiskimage = xstrdup(optarg); break; case LONG_OPT_ACCTG_FREQ: - opt.acctg_freq = _get_int(optarg, "acctg-freq"); + xfree(opt.acctg_freq); + opt.acctg_freq = xstrdup(optarg); break; case LONG_OPT_NOSHELL: opt.no_shell = true; diff --git a/src/salloc/salloc.c b/src/salloc/salloc.c index 84f999745c8c25a05489384cefd29c260e52cd7b..85c7cd94d287d2c3d3af7a6f61cfc9f59d11d5f9 100644 --- a/src/salloc/salloc.c +++ b/src/salloc/salloc.c @@ -407,8 +407,8 @@ int main(int argc, char *argv[]) env_array_append_fmt(&env, "SLURM_OVERCOMMIT", "%d", opt.overcommit); } - if (opt.acctg_freq >= 0) { - env_array_append_fmt(&env, "SLURM_ACCTG_FREQ", "%d", + if (opt.acctg_freq) { + env_array_append_fmt(&env, "SLURM_ACCTG_FREQ", "%s", opt.acctg_freq); } if (opt.network) diff --git a/src/sbatch/opt.c b/src/sbatch/opt.c index cb36cd04fe481a02647b05b144e5cd165ea955cd..a5c9828dfae27587a752dbf04035e9239be32909 100644 --- a/src/sbatch/opt.c +++ b/src/sbatch/opt.c @@ -450,7 +450,7 @@ struct env_vars { env_vars_t env_vars[] = { {"SBATCH_ACCOUNT", OPT_STRING, &opt.account, NULL }, {"SBATCH_ARRAY_INX", OPT_STRING, &opt.array_inx, NULL }, - {"SBATCH_ACCTG_FREQ", OPT_INT, &opt.acctg_freq, NULL }, + {"SBATCH_ACCTG_FREQ", OPT_STRING, &opt.acctg_freq, NULL }, {"SBATCH_BLRTS_IMAGE", OPT_STRING, &opt.blrtsimage, NULL }, {"SBATCH_CHECKPOINT", OPT_STRING, &opt.ckpt_interval_str, NULL }, {"SBATCH_CHECKPOINT_DIR",OPT_STRING, &opt.ckpt_dir, NULL }, @@ -2454,8 +2454,8 @@ static bool _opt_verify(void) acct_gather_profile_to_string(opt.profile)); - if (opt.acctg_freq >= 0) - setenvf(NULL, 
"SLURM_ACCTG_FREQ", "%d", opt.acctg_freq); + if (opt.acctg_freq) + setenvf(NULL, "SLURM_ACCTG_FREQ", "%s", opt.acctg_freq); #ifdef HAVE_AIX if (opt.network == NULL) diff --git a/src/slurmd/common/slurmstepd_init.c b/src/slurmd/common/slurmstepd_init.c index d3def654f8d72e8f09803644c9860841e2059d60..88b3e2f631e36f183b111ddaa1cc722e3c0f1a97 100644 --- a/src/slurmd/common/slurmstepd_init.c +++ b/src/slurmd/common/slurmstepd_init.c @@ -112,6 +112,8 @@ extern int unpack_slurmd_conf_lite_no_alloc(slurmd_conf_t *conf, Buf buffer) unpack_error: error("unpack_error in unpack_slurmd_conf_lite_no_alloc: %m"); + xfree(conf->job_acct_gather_freq); + xfree(conf->job_acct_gather_type); xfree(conf->hostname); xfree(conf->spooldir); xfree(conf->node_name); diff --git a/src/slurmd/slurmstepd/slurmstepd.c b/src/slurmd/slurmstepd/slurmstepd.c index 3e0a104e2f2ddbc26fcb48ddbcb2c0aa91bfc168..1be7975dbcb630c73bdf3e4cc557249bd05788b8 100644 --- a/src/slurmd/slurmstepd/slurmstepd.c +++ b/src/slurmd/slurmstepd/slurmstepd.c @@ -390,11 +390,6 @@ _init_from_slurmd(int sock, char **argv, log_alter(conf->log_opts, 0, conf->logfile); debug2("debug level is %d.", conf->debug_level); - /* FIXME: comment this out when we get this to work like a - * string. It will be handled in - * acct_gather_profile_startpoll when that happens - */ - jobacct_gather_startpoll(conf->job_acct_gather_freq); switch_g_slurmd_step_init(); diff --git a/src/slurmd/slurmstepd/slurmstepd_job.c b/src/slurmd/slurmstepd/slurmstepd_job.c index 4c06372d31330d4a5015b03f6016dcbd4c66a339..87e3c92adcceaf795512941845f0ec33cfb1a384 100644 --- a/src/slurmd/slurmstepd/slurmstepd_job.c +++ b/src/slurmd/slurmstepd/slurmstepd_job.c @@ -180,16 +180,17 @@ job_create(launch_tasks_request_msg_t *msg) return NULL; } - if (msg->job_mem_lim && (msg->acctg_freq != (uint16_t) NO_VAL) - && (msg->acctg_freq > conf->job_acct_gather_freq)) { - error("Can't set frequency to %u, it is higher than %u. 
" - "We need it to be at least at this level to " - "monitor memory usage.", - msg->acctg_freq, conf->job_acct_gather_freq); - slurm_seterrno (ESLURMD_INVALID_ACCT_FREQ); - _pwd_destroy(pwd); - return NULL; - } + /* FIXME: handle this now that acctg_freq is a string */ + /* if (msg->job_mem_lim && (msg->acctg_freq != (uint16_t) NO_VAL) */ + /* && (msg->acctg_freq > conf->job_acct_gather_freq)) { */ + /* error("Can't set frequency to %u, it is higher than %u. " */ + /* "We need it to be at least at this level to " */ + /* "monitor memory usage.", */ + /* msg->acctg_freq, conf->job_acct_gather_freq); */ + /* slurm_seterrno (ESLURMD_INVALID_ACCT_FREQ); */ + /* _pwd_destroy(pwd); */ + /* return NULL; */ + /* } */ job = xmalloc(sizeof(slurmd_job_t)); #ifndef HAVE_FRONT_END @@ -293,17 +294,14 @@ job_create(launch_tasks_request_msg_t *msg) job->nodeid = nodeid; job->debug = msg->slurmd_debug; job->cpus = msg->cpus_allocated[nodeid]; - if (msg->acctg_freq != (uint16_t) NO_VAL) - jobacct_gather_change_poll(msg->acctg_freq); /* This needs to happen before acct_gather_profile_startpoll and only really looks at the profile in the job. */ acct_gather_profile_g_node_step_start(job); - /* FIXME: uncomment this when we get acctg-freq working like a - string */ - //acct_gather_profile_startpoll("Network=10","5"); + acct_gather_profile_startpoll(msg->acctg_freq, + conf->job_acct_gather_freq); job->multi_prog = msg->multi_prog; job->timelimit = (time_t) -1; @@ -382,16 +380,18 @@ job_batch_job_create(batch_job_launch_msg_t *msg) _pwd_destroy(pwd); return NULL; } - if (msg->job_mem && (msg->acctg_freq != (uint16_t) NO_VAL) && - (msg->acctg_freq > conf->job_acct_gather_freq)) { - error("Can't set frequency to %u, it is higher than %u. 
" "We need it to be at least at this level to " - "monitor memory usage.", - msg->acctg_freq, conf->job_acct_gather_freq); - slurm_seterrno (ESLURMD_INVALID_ACCT_FREQ); - _pwd_destroy(pwd); - return NULL; - } + + /* FIXME: handle this now that acctg_freq is a string */ + /* if (msg->job_mem && (msg->acctg_freq != (uint16_t) NO_VAL) */ + /* && (msg->acctg_freq > conf->job_acct_gather_freq)) { */ + /* error("Can't set frequency to %u, it is higher than %u. " */ + /* "We need it to be at least at this level to " */ + /* "monitor memory usage.", */ + /* msg->acctg_freq, conf->job_acct_gather_freq); */ + /* slurm_seterrno (ESLURMD_INVALID_ACCT_FREQ); */ + /* _pwd_destroy(pwd); */ + /* return NULL; */ + /* } */ job = xmalloc(sizeof(slurmd_job_t)); @@ -407,8 +407,14 @@ job_batch_job_create(batch_job_launch_msg_t *msg) job->array_task_id = msg->array_task_id; job->batch = true; - if (msg->acctg_freq != (uint16_t) NO_VAL) - jobacct_gather_change_poll(msg->acctg_freq); + /* This needs to happen before acct_gather_profile_startpoll + and only really looks at the profile in the job. 
+ */ + acct_gather_profile_g_node_step_start(job); + /* needed for the jobacct_gather plugin to start */ + acct_gather_profile_startpoll(msg->acctg_freq, + conf->job_acct_gather_freq); + job->multi_prog = 0; job->open_mode = msg->open_mode; job->overcommit = (bool) msg->overcommit; diff --git a/src/srun/libsrun/opt.c b/src/srun/libsrun/opt.c index d8d6a908c5e810db3c6bf18e567265b3a0085754..d1ab97712de4814864f81a2e7a0976e529b7ed8d 100644 --- a/src/srun/libsrun/opt.c +++ b/src/srun/libsrun/opt.c @@ -542,7 +542,7 @@ struct env_vars { env_vars_t env_vars[] = { {"SLURMD_DEBUG", OPT_INT, &opt.slurmd_debug, NULL }, {"SLURM_ACCOUNT", OPT_STRING, &opt.account, NULL }, -{"SLURM_ACCTG_FREQ", OPT_INT, &opt.acctg_freq, NULL }, +{"SLURM_ACCTG_FREQ", OPT_STRING, &opt.acctg_freq, NULL }, {"SLURM_BLRTS_IMAGE", OPT_STRING, &opt.blrtsimage, NULL }, {"SLURM_CHECKPOINT", OPT_STRING, &opt.ckpt_interval_str, NULL }, {"SLURM_CHECKPOINT_DIR",OPT_STRING, &opt.ckpt_dir, NULL },