diff --git a/src/common/slurm_protocol_api.h b/src/common/slurm_protocol_api.h index 805eb9fd2a60d408e2241ce84a25ca19c5621e38..5e6c05db492342bf47e62bb3a5b6b675b43f30e2 100644 --- a/src/common/slurm_protocol_api.h +++ b/src/common/slurm_protocol_api.h @@ -51,6 +51,9 @@ #include "src/common/slurm_protocol_defs.h" #include "src/common/slurm_protocol_util.h" +#define MIN_NOALLOC_JOBID ((uint32_t) 0xffff0000) +#define MAX_NOALLOC_JOBID ((uint32_t) 0xfffffffd) + enum controller_id { PRIMARY_CONTROLLER = 1, SECONDARY_CONTROLLER = 2 diff --git a/src/slurmctld/job_mgr.c b/src/slurmctld/job_mgr.c index 167cb8e63285b5af63e8c503eafb4a13d56d024f..5bd973e82e3e2353809cb205ad0696eefc5d23c5 100644 --- a/src/slurmctld/job_mgr.c +++ b/src/slurmctld/job_mgr.c @@ -2432,9 +2432,11 @@ static void _set_job_id(struct job_record *job_ptr) || (strlen(job_ptr->partition) == 0)) fatal("_set_job_id: partition not set"); - /* Include below code only if fear of rolling over 32 bit job IDs */ + /* Ensure no conflict in job id if we roll over 32 bits */ while (1) { - new_id = job_id_sequence++; + if (++job_id_sequence >= MIN_NOALLOC_JOBID) + job_id_sequence = slurmctld_conf.first_job_id; + new_id = job_id_sequence; if (find_job_record(new_id) == NULL) { job_ptr->job_id = new_id; break; @@ -2779,6 +2781,13 @@ validate_jobs_on_node(char *node_name, uint32_t * job_count, /* Check that jobs running are really supposed to be there */ for (i = 0; i < *job_count; i++) { + if ( (job_id_ptr[i] >= MIN_NOALLOC_JOBID) && + (job_id_ptr[i] <= MAX_NOALLOC_JOBID) ) { + info("NoAllocate job %u.%u reported on node %s", + job_id_ptr[i], step_id_ptr[i], node_name); + continue; + } + job_ptr = find_job_record(job_id_ptr[i]); if (job_ptr == NULL) { error("Orphan job %u.%u reported on node %s", diff --git a/src/slurmd/job.c b/src/slurmd/job.c index 886dff2735e4f26560494b684781b26936f13e03..6e3c78f2cb7019099a735706f1c167621a351d51 100644 --- a/src/slurmd/job.c +++ b/src/slurmd/job.c @@ -440,9 +440,9 @@ 
job_update_shm(slurmd_job_t *job) return SLURM_ERROR; if (job->stepid == NO_VAL) - debug("updated shm with job %d", job->jobid); + debug("updated shm with job %u", job->jobid); else - debug("updated shm with step %d.%d", job->jobid, job->stepid); + debug("updated shm with step %u.%u", job->jobid, job->stepid); return SLURM_SUCCESS; } diff --git a/src/slurmd/shm.c b/src/slurmd/shm.c index 756c86939c0ed08632257e6ab65dab530e33c71a..08c55f47d85ff272e467ade680b586cab06bf10d 100644 --- a/src/slurmd/shm.c +++ b/src/slurmd/shm.c @@ -175,7 +175,7 @@ shm_fini(void) for (i = 0; i < MAX_JOB_STEPS; i++) { if (slurmd_shm->step[i].state > SLURMD_JOB_UNUSED) { job_step_t *s = &slurmd_shm->step[i]; - info ("Used shm slot: %d %d\n", s->jobid, s->stepid); + info ("Used shm slot: %u %u\n", s->jobid, s->stepid); } } diff --git a/src/slurmd/slurmd.c b/src/slurmd/slurmd.c index 51769c7eb52a21ad47845096b2bbb8c3bff5bf26..b187dc6ea57ef6f281173d084cee992e68833268 100644 --- a/src/slurmd/slurmd.c +++ b/src/slurmd/slurmd.c @@ -369,16 +369,16 @@ _fill_registration_msg(slurm_node_registration_status_msg_t *msg) n = 0; while ((s = list_next(i))) { if (!shm_step_still_running(s->jobid, s->stepid)) { - debug("deleting stale reference to %d.%d in shm", + debug("deleting stale reference to %u.%u in shm", s->jobid, (int32_t) s->stepid); shm_delete_step(s->jobid, s->stepid); --(msg->job_count); continue; } if (s->stepid == NO_VAL) - debug("found apparently running job %d", s->jobid); + debug("found apparently running job %u", s->jobid); else - debug("found apparently running step %d.%d", + debug("found apparently running step %u.%u", s->jobid, s->stepid); msg->job_id[n] = s->jobid; msg->step_id[n] = s->stepid; diff --git a/src/srun/job.c b/src/srun/job.c index 5c532d01112faa301506e7978c2ee1ef3b93743c..57d8ae95cfbd276ac6a5483bd262a02d03fe3bf5 100644 --- a/src/srun/job.c +++ b/src/srun/job.c @@ -140,10 +140,10 @@ job_create_noalloc(void) } srand48(getpid()); + info->jobid = MIN_NOALLOC_JOBID + + 
((uint32_t) lrand48() % + (MAX_NOALLOC_JOBID - MIN_NOALLOC_JOBID + 1)); info->stepid = (uint32_t) (lrand48()); - info->jobid = (uint32_t) (lrand48()); - if (info->jobid == 0) - info->jobid = 1; info->nodelist = opt.nodelist; info->nnodes = hostlist_count(hl);