From 673b60794d2fc58ebfcfb19c5fd59edf81bcb143 Mon Sep 17 00:00:00 2001 From: Moe Jette <jette1@llnl.gov> Date: Tue, 23 Sep 2003 16:46:38 +0000 Subject: [PATCH] Define (and use) minimum and maximum job id to use for no_allocate jobs. These jobs are reported by slurmd on node registration. They are logged but otherwise ignored by slurmctld. Several changes to slurmd logging messages to report job id and step id using %u format rather than %d format (which shows no-allocate job id values as negative numbers). --- src/common/slurm_protocol_api.h | 3 +++ src/slurmctld/job_mgr.c | 13 +++++++++++-- src/slurmd/job.c | 4 ++-- src/slurmd/shm.c | 2 +- src/slurmd/slurmd.c | 6 +++--- src/srun/job.c | 6 +++--- 6 files changed, 23 insertions(+), 11 deletions(-) diff --git a/src/common/slurm_protocol_api.h b/src/common/slurm_protocol_api.h index 805eb9fd2a6..5e6c05db492 100644 --- a/src/common/slurm_protocol_api.h +++ b/src/common/slurm_protocol_api.h @@ -51,6 +51,9 @@ #include "src/common/slurm_protocol_defs.h" #include "src/common/slurm_protocol_util.h" +#define MIN_NOALLOC_JOBID ((uint32_t) 0xffff0000) +#define MAX_NOALLOC_JOBID ((uint32_t) 0xfffffffd) + enum controller_id { PRIMARY_CONTROLLER = 1, SECONDARY_CONTROLLER = 2 diff --git a/src/slurmctld/job_mgr.c b/src/slurmctld/job_mgr.c index 167cb8e6328..5bd973e82e3 100644 --- a/src/slurmctld/job_mgr.c +++ b/src/slurmctld/job_mgr.c @@ -2432,9 +2432,11 @@ static void _set_job_id(struct job_record *job_ptr) || (strlen(job_ptr->partition) == 0)) fatal("_set_job_id: partition not set"); - /* Include below code only if fear of rolling over 32 bit job IDs */ + /* Insure no conflict in job id if we roll over 32 bits */ while (1) { - new_id = job_id_sequence++; + if (++job_id_sequence >= MIN_NOALLOC_JOBID) + job_id_sequence = slurmctld_conf.first_job_id; + new_id = job_id_sequence; if (find_job_record(new_id) == NULL) { job_ptr->job_id = new_id; break; @@ -2779,6 +2781,13 @@ validate_jobs_on_node(char *node_name, uint32_t * job_count, 
/* Check that jobs running are really supposed to be there */ for (i = 0; i < *job_count; i++) { + if ( (job_id_ptr[i] >= MIN_NOALLOC_JOBID) && + (job_id_ptr[i] <= MAX_NOALLOC_JOBID) ) { + info("NoAllocate job %u.%u reported on node %s", + job_id_ptr[i], step_id_ptr[i], node_name); + continue; + } + job_ptr = find_job_record(job_id_ptr[i]); if (job_ptr == NULL) { error("Orphan job %u.%u reported on node %s", diff --git a/src/slurmd/job.c b/src/slurmd/job.c index 886dff2735e..6e3c78f2cb7 100644 --- a/src/slurmd/job.c +++ b/src/slurmd/job.c @@ -440,9 +440,9 @@ job_update_shm(slurmd_job_t *job) return SLURM_ERROR; if (job->stepid == NO_VAL) - debug("updated shm with job %d", job->jobid); + debug("updated shm with job %u", job->jobid); else - debug("updated shm with step %d.%d", job->jobid, job->stepid); + debug("updated shm with step %u.%u", job->jobid, job->stepid); return SLURM_SUCCESS; } diff --git a/src/slurmd/shm.c b/src/slurmd/shm.c index 756c86939c0..08c55f47d85 100644 --- a/src/slurmd/shm.c +++ b/src/slurmd/shm.c @@ -175,7 +175,7 @@ shm_fini(void) for (i = 0; i < MAX_JOB_STEPS; i++) { if (slurmd_shm->step[i].state > SLURMD_JOB_UNUSED) { job_step_t *s = &slurmd_shm->step[i]; - info ("Used shm slot: %d %d\n", s->jobid, s->stepid); + info ("Used shm slot: %u %u\n", s->jobid, s->stepid); } } diff --git a/src/slurmd/slurmd.c b/src/slurmd/slurmd.c index 51769c7eb52..b187dc6ea57 100644 --- a/src/slurmd/slurmd.c +++ b/src/slurmd/slurmd.c @@ -369,16 +369,16 @@ _fill_registration_msg(slurm_node_registration_status_msg_t *msg) n = 0; while ((s = list_next(i))) { if (!shm_step_still_running(s->jobid, s->stepid)) { - debug("deleting stale reference to %d.%d in shm", + debug("deleting stale reference to %u.%u in shm", s->jobid, (int32_t) s->stepid); shm_delete_step(s->jobid, s->stepid); --(msg->job_count); continue; } if (s->stepid == NO_VAL) - debug("found apparently running job %d", s->jobid); + debug("found apparently running job %u", s->jobid); else - debug("found 
apparently running step %d.%d", + debug("found apparently running step %u.%u", s->jobid, s->stepid); msg->job_id[n] = s->jobid; msg->step_id[n] = s->stepid; diff --git a/src/srun/job.c b/src/srun/job.c index 5c532d01112..57d8ae95cfb 100644 --- a/src/srun/job.c +++ b/src/srun/job.c @@ -140,10 +140,10 @@ job_create_noalloc(void) } srand48(getpid()); + info->jobid = MIN_NOALLOC_JOBID + + ((uint32_t) lrand48() % + (MAX_NOALLOC_JOBID - MIN_NOALLOC_JOBID + 1)); info->stepid = (uint32_t) (lrand48()); - info->jobid = (uint32_t) (lrand48()); - if (info->jobid == 0) - info->jobid = 1; info->nodelist = opt.nodelist; info->nnodes = hostlist_count(hl); -- GitLab