diff --git a/RELEASE_NOTES b/RELEASE_NOTES
index b2f4e20ae81d383beb78cd6e556351ab76eecde1..263201e60081924d7b4591987bcd9713c646d346 100644
--- a/RELEASE_NOTES
+++ b/RELEASE_NOTES
@@ -45,15 +45,15 @@ BLUEGENE SPECIFIC CHANGES
 
 API CHANGES
 
-* General changes:
-
-* Changed the following struct definitions:
+* Changed members of the following structs:
+job_info_t
+	num_procs -> num_cpus
+job_desc_msg_t
+	num_procs -> min_cpus
 
 * Added the following struct definitions:
-
-* Renamed
-
-* Changed members of the following structs:
+job_desc_msg_t
+	max_cpus
 
 * Changed the following enums
 
diff --git a/contribs/perlapi/libslurm-perl/alloc.c b/contribs/perlapi/libslurm-perl/alloc.c
index 4d95b2354ed27abf272893a5d98492257c81e41d..dba2ac194566a126e9c6f58a7448b62a612c01fd 100644
--- a/contribs/perlapi/libslurm-perl/alloc.c
+++ b/contribs/perlapi/libslurm-perl/alloc.c
@@ -26,7 +26,7 @@ hv_to_job_desc_msg(HV* hv, job_desc_msg_t* job_desc_msg)
 	I32 klen;
 	STRLEN vlen;
 	int num_keys, i;
-	
+
 	slurm_init_job_desc_msg(job_desc_msg);
 
 	FETCH_FIELD(hv, job_desc_msg, contiguous, uint16_t, FALSE);
@@ -38,7 +38,7 @@ hv_to_job_desc_msg(HV* hv, job_desc_msg_t* job_desc_msg)
 		num_keys = HvKEYS(environ_hv);
 		job_desc_msg->env_size = num_keys;
 		Newz(0, job_desc_msg->environment, num_keys + 1, char*);
-		
+
 		hv_iterinit(environ_hv);
 		i = 0;
 		while((val = hv_iternextsv(environ_hv, &env_key, &klen))) {
@@ -66,7 +66,8 @@ hv_to_job_desc_msg(HV* hv, job_desc_msg_t* job_desc_msg)
 	FETCH_FIELD(hv, job_desc_msg, exc_nodes, charp, FALSE);
 	FETCH_FIELD(hv, job_desc_msg, shared, uint16_t, FALSE);
 	FETCH_FIELD(hv, job_desc_msg, time_limit, uint32_t, FALSE);
-	FETCH_FIELD(hv, job_desc_msg, num_procs, uint32_t, FALSE);
+	FETCH_FIELD(hv, job_desc_msg, min_cpus, uint32_t, FALSE);
+	FETCH_FIELD(hv, job_desc_msg, max_cpus, uint32_t, FALSE);
 	FETCH_FIELD(hv, job_desc_msg, min_nodes, uint32_t, FALSE);
 	FETCH_FIELD(hv, job_desc_msg, max_nodes, uint32_t, FALSE);
 	FETCH_FIELD(hv, job_desc_msg, min_sockets, uint16_t, FALSE);
@@ -156,7 +157,7 @@ hv_to_job_desc_msg(HV* hv, job_desc_msg_t* job_desc_msg)
 }
 
 /*
- * free allocated environment variable memory for job_desc_msg_t 
+ * free allocated environment variable memory for job_desc_msg_t
  */
 static void
 _free_environment(char** environ)
@@ -188,7 +189,7 @@ resource_allocation_response_msg_to_hv(resource_allocation_response_msg_t* resp_
 {
 	AV* avp;
 	int i;
-	
+
 	STORE_FIELD(hv, resp_msg, job_id, uint32_t);
 	if(resp_msg->node_list)
 		STORE_FIELD(hv, resp_msg, node_list, charp);
@@ -199,7 +200,7 @@ resource_allocation_response_msg_to_hv(resource_allocation_response_msg_t* resp_
 		av_store(avp, i, newSVuv(resp_msg->cpus_per_node[i]));
 	}
 	hv_store_sv(hv, "cpus_per_node", newRV_noinc((SV*)avp));
-	
+
 	avp = newAV();
 	for(i = 0; i < resp_msg->num_cpu_groups; i ++) {
 		av_store(avp, i, newSVuv(resp_msg->cpu_count_reps[i]));
@@ -220,7 +221,7 @@ job_alloc_info_response_msg_to_hv(job_alloc_info_response_msg_t *resp_msg, HV* h
 {
 	AV* avp;
 	int i;
-	
+
 	STORE_FIELD(hv, resp_msg, job_id, uint32_t);
 	if(resp_msg->node_list)
 		STORE_FIELD(hv, resp_msg, node_list, charp);
@@ -231,7 +232,7 @@ job_alloc_info_response_msg_to_hv(job_alloc_info_response_msg_t *resp_msg, HV* h
 		av_store(avp, i, newSVuv(resp_msg->cpus_per_node[i]));
 	}
 	hv_store_sv(hv, "cpus_per_node", newRV_noinc((SV*)avp));
-	
+
 	avp = newAV();
 	for(i = 0; i < resp_msg->num_cpu_groups; i ++) {
 		av_store(avp, i, newSVuv(resp_msg->cpu_count_reps[i]));
@@ -263,4 +264,3 @@ submit_response_msg_to_hv(submit_response_msg_t *resp_msg, HV* hv)
 	STORE_FIELD(hv, resp_msg, error_code, uint32_t);
 	return 0;
 }
-
diff
--git a/contribs/perlapi/libslurm-perl/job.c b/contribs/perlapi/libslurm-perl/job.c index 1e8454dd3f7ea614e697290a149501ea8251d9d9..0909c2618901abc7db6a2e528b8c4b6f6be11e83 100644 --- a/contribs/perlapi/libslurm-perl/job.c +++ b/contribs/perlapi/libslurm-perl/job.c @@ -78,7 +78,7 @@ job_info_to_hv(job_info_t* job_info, HV* hv) STORE_FIELD(hv, job_info, ntasks_per_node, uint16_t); STORE_FIELD(hv, job_info, ntasks_per_socket, uint16_t); STORE_FIELD(hv, job_info, num_nodes, uint32_t); - STORE_FIELD(hv, job_info, num_procs, uint32_t); + STORE_FIELD(hv, job_info, num_cpus, uint32_t); if(job_info->partition) STORE_FIELD(hv, job_info, partition, charp); STORE_FIELD(hv, job_info, pre_sus_time, time_t); diff --git a/slurm/slurm.h.in b/slurm/slurm.h.in index 3c59eda49ddae49ddeaf6db72d791be5cb4db19e..3d0ad15814f4a277c7b20e24e142a09af0f4293f 100644 --- a/slurm/slurm.h.in +++ b/slurm/slurm.h.in @@ -616,7 +616,9 @@ typedef struct job_descriptor { /* For submit, allocate, and update requests */ * real memory per CPU | MEM_PER_CPU, * default=0 (no limit) */ uint32_t job_min_tmp_disk; /* minimum tmp disk per node, default=0 */ - uint32_t num_procs; /* total count of processors required, + uint32_t min_cpus; /* minimum number of processors required, + * default=0 */ + uint32_t max_cpus; /* maximum number of processors required, * default=0 */ uint32_t min_nodes; /* minimum number of nodes required by job, * default=0 */ @@ -687,6 +689,7 @@ typedef struct job_info { uint32_t job_min_tmp_disk; /* minimum tmp disk per node, default=0 */ uint16_t job_state; /* state of the job, see enum job_states */ char *licenses; /* licenses required by the job */ + uint32_t max_cpus; /* maximum number of cpus usable by job */ uint32_t max_nodes; /* maximum number of nodes usable by job */ uint16_t min_cores; /* minimum number of cores per cpu */ uint16_t min_sockets; /* minimum number of sockets per node */ @@ -703,7 +706,7 @@ typedef struct job_info { uint16_t ntasks_per_socket;/* number of tasks to invoke on each socket */ uint32_t num_nodes; /* minimum number of nodes required by job */ - uint32_t num_procs; /* number of processors required by job */ + uint32_t num_cpus; /* minimum number of cpus required by job */ char *partition; /* name of assigned partition */ time_t pre_sus_time; /* time job ran prior to last suspend */ uint32_t priority; /* relative priority of the job, diff --git a/slurm/slurm_errno.h b/slurm/slurm_errno.h index a700a7c57cd392b4dd12f82e0acce0a293903ecd..701c8c15ee577fb75e50878599ade967d82b55b3 100644 --- a/slurm/slurm_errno.h +++ b/slurm/slurm_errno.h @@ -180,6 +180,7 @@ enum { ESLURM_INVALID_QOS, ESLURM_QOS_PREEMPTION_LOOP, ESLURM_NODE_NOT_AVAIL, + ESLURM_INVALID_CPU_COUNT, /* switch specific error codes, specific values defined in plugin module */ ESLURM_SWITCH_MIN = 3000, diff --git a/src/api/init_msg.c b/src/api/init_msg.c index 00e940e75a1e00ed8d8c733f24f54094064a9a1e..6a4c498b9428701ea5bbe088ba9ab9cbfe53ceb0 100644 --- a/src/api/init_msg.c +++ b/src/api/init_msg.c @@ -76,9 +76,11 @@ void slurm_init_job_desc_msg(job_desc_msg_t * job_desc_msg) job_desc_msg->job_min_memory = NO_VAL; job_desc_msg->job_min_tmp_disk = NO_VAL; job_desc_msg->kill_on_node_fail = (uint16_t) NO_VAL; + job_desc_msg->max_cpus = NO_VAL; job_desc_msg->max_nodes = NO_VAL; job_desc_msg->mem_bind_type = (uint16_t) NO_VAL; job_desc_msg->min_cores = (uint16_t) NO_VAL; + job_desc_msg->min_cpus = NO_VAL; job_desc_msg->min_nodes = NO_VAL; job_desc_msg->min_sockets = (uint16_t) NO_VAL; job_desc_msg->min_threads = (uint16_t) NO_VAL; 
@@ -86,7 +88,6 @@ void slurm_init_job_desc_msg(job_desc_msg_t * job_desc_msg) job_desc_msg->ntasks_per_core = (uint16_t) NO_VAL; job_desc_msg->ntasks_per_node = (uint16_t) NO_VAL; job_desc_msg->ntasks_per_socket = (uint16_t) NO_VAL; - job_desc_msg->num_procs = NO_VAL; job_desc_msg->num_tasks = NO_VAL; job_desc_msg->overcommit = (uint8_t) NO_VAL; job_desc_msg->plane_size = (uint16_t) NO_VAL; diff --git a/src/api/job_info.c b/src/api/job_info.c index a8f7bfb5accdf9f547b801cbb6c1359d97dc11e0..5e74bd71171055dbbbe247571b30633b26737e66 100644 --- a/src/api/job_info.c +++ b/src/api/job_info.c @@ -365,8 +365,6 @@ slurm_sprint_job_info ( job_info_t * job_ptr, int one_liner ) /****** Line 13 ******/ #ifdef HAVE_BG - convert_num_unit((float)job_ptr->num_procs, tmp1, sizeof(tmp1), - UNIT_NONE); select_g_select_jobinfo_get(job_ptr->select_jobinfo, SELECT_JOBDATA_NODE_CNT, &min_nodes); @@ -376,10 +374,10 @@ slurm_sprint_job_info ( job_info_t * job_ptr, int one_liner ) } else max_nodes = min_nodes; #else - snprintf(tmp1, sizeof(tmp1), "%u", job_ptr->num_procs); min_nodes = job_ptr->num_nodes; max_nodes = job_ptr->max_nodes; #endif + _sprint_range(tmp1, sizeof(tmp1), job_ptr->num_cpus, job_ptr->max_cpus); _sprint_range(tmp2, sizeof(tmp2), min_nodes, max_nodes); snprintf(tmp_line, sizeof(tmp_line), "NumNodes=%s NumCPUs=%s CPUs/Task=%u ReqS:C:T=%u:%u:%u", diff --git a/src/common/env.c b/src/common/env.c index 82d16f9005d6f1be11fd6de00c77d0ad50e1b4ea..0310da7493100ccd5cad7ceef2b52e302d669a80 100644 --- a/src/common/env.c +++ b/src/common/env.c @@ -999,7 +999,7 @@ env_array_for_job(char ***dest, const resource_allocation_response_msg_t *alloc, if((int)desc->cpus_per_task > 1 && desc->cpus_per_task != (uint16_t)NO_VAL) num_tasks /= desc->cpus_per_task; - //num_tasks = desc->num_procs; + //num_tasks = desc->min_cpus; } if(desc->task_dist == SLURM_DIST_ARBITRARY) { diff --git a/src/common/slurm_accounting_storage.c b/src/common/slurm_accounting_storage.c index fd14f14d2f55fb25fc80b895763db43bc46ad5d1..b1324153adbf94ae16e006644179cbeebfa0ff61 100644 --- a/src/common/slurm_accounting_storage.c +++ b/src/common/slurm_accounting_storage.c @@ -161,9 +161,9 @@ typedef struct slurm_acct_storage_ops { char *cluster, struct node_record *node_ptr, time_t event_time); - int (*cluster_procs) (void *db_conn, - char *cluster, char *cluster_nodes, - uint32_t procs, time_t event_time); + int (*cluster_cpus) (void *db_conn, + char *cluster, char *cluster_nodes, + uint32_t cpus, time_t event_time); int (*c_get_usage) (void *db_conn, uint32_t uid, void *cluster_rec, int type, time_t start, time_t end); @@ -265,7 +265,7 @@ static slurm_acct_storage_ops_t * _acct_storage_get_ops( "acct_storage_p_roll_usage", "clusteracct_storage_p_node_down", "clusteracct_storage_p_node_up", - "clusteracct_storage_p_cluster_procs", + "clusteracct_storage_p_cluster_cpus", "clusteracct_storage_p_get_usage", "clusteracct_storage_p_register_ctld", "jobacct_storage_p_job_start", @@ -2440,7 +2440,7 @@ extern void pack_acct_event_rec(void *in, uint16_t rpc_version, Buf buffer) } extern int unpack_acct_event_rec(void **object, uint16_t rpc_version, - Buf buffer) + Buf buffer) { uint32_t uint32_tmp; acct_event_rec_t *object_ptr = xmalloc(sizeof(acct_event_rec_t)); @@ -8621,16 +8621,16 @@ extern int clusteracct_storage_g_node_up(void *db_conn, } -extern int clusteracct_storage_g_cluster_procs(void *db_conn, - char *cluster, - char *cluster_nodes, - uint32_t procs, - time_t event_time) +extern int clusteracct_storage_g_cluster_cpus(void *db_conn, + char 
*cluster, + char *cluster_nodes, + uint32_t cpus, + time_t event_time) { if (slurm_acct_storage_init(NULL) < 0) return SLURM_ERROR; - return (*(g_acct_storage_context->ops.cluster_procs)) - (db_conn, cluster, cluster_nodes, procs, event_time); + return (*(g_acct_storage_context->ops.cluster_cpus)) + (db_conn, cluster, cluster_nodes, cpus, event_time); } diff --git a/src/common/slurm_accounting_storage.h b/src/common/slurm_accounting_storage.h index 5c8af240e3dabc1169c3f51a102382b74eca4060..1cc71b59a75ddb9bfca81d613b70d7cbc6d7db69 100644 --- a/src/common/slurm_accounting_storage.h +++ b/src/common/slurm_accounting_storage.h @@ -1132,11 +1132,11 @@ extern int clusteracct_storage_g_node_up(void *db_conn, struct node_record *node_ptr, time_t event_time); -extern int clusteracct_storage_g_cluster_procs(void *db_conn, - char *cluster, - char *cluster_nodes, - uint32_t procs, - time_t event_time); +extern int clusteracct_storage_g_cluster_cpus(void *db_conn, + char *cluster, + char *cluster_nodes, + uint32_t cpus, + time_t event_time); extern int clusteracct_storage_g_register_ctld( void *db_conn, char *cluster, uint16_t port); diff --git a/src/common/slurm_errno.c b/src/common/slurm_errno.c index b33190ba45b84f67727976ad01b769d8362d3b10..afd730c5ed21d36d09135524c931d77230be0843 100644 --- a/src/common/slurm_errno.c +++ b/src/common/slurm_errno.c @@ -262,6 +262,8 @@ static slurm_errtab_t slurm_errtab[] = { "QOS Preemption loop detected" }, { ESLURM_NODE_NOT_AVAIL, "Required node not available (down or drained)" }, + { ESLURM_INVALID_CPU_COUNT, + "CPU count specification invalid" }, /* slurmd error codes */ diff --git a/src/common/slurm_protocol_common.h b/src/common/slurm_protocol_common.h index 7cb00170c8926825fe6310dd9858805203c515e2..c216bfd9120781323ea1aa5fc412c710b103462e 100644 --- a/src/common/slurm_protocol_common.h +++ b/src/common/slurm_protocol_common.h @@ -68,6 +68,7 @@ * In slurm_protocol_util.c check_header_version(), and init_header() * need to be updated also when changes are added */ #define SLURM_PROTOCOL_VERSION ((SLURM_API_MAJOR << 8) | SLURM_API_AGE) +#define SLURM_2_2_PROTOCOL_VERSION SLURM_PROTOCOL_VERSION #define SLURM_2_1_PROTOCOL_VERSION ((21 << 8) | 0) #define SLURM_2_0_PROTOCOL_VERSION ((20 << 8) | 0) #define SLURM_1_3_PROTOCOL_VERSION ((13 << 8) | 0) diff --git a/src/common/slurm_protocol_pack.c b/src/common/slurm_protocol_pack.c index 4dc26fb8f1847e61961b417993ed25f7b7ad744d..d0f16961ec9b81ec5512c97474a6d350cd177d37 100644 --- a/src/common/slurm_protocol_pack.c +++ b/src/common/slurm_protocol_pack.c @@ -3199,7 +3199,7 @@ _unpack_job_info_members(job_info_t * job, Buf buffer, char *node_inx_str; multi_core_data_t *mc_ptr; - if(protocol_version >= SLURM_2_1_PROTOCOL_VERSION) { + if(protocol_version >= SLURM_2_2_PROTOCOL_VERSION) { safe_unpack32(&job->assoc_id, buffer); safe_unpack32(&job->job_id, buffer); safe_unpack32(&job->user_id, buffer); @@ -3248,7 +3248,6 @@ _unpack_job_info_members(job_info_t * job, Buf buffer, job->node_inx = bitfmt2int(node_inx_str); xfree(node_inx_str); } - safe_unpack32(&job->num_procs, buffer); if (select_g_select_jobinfo_unpack(&job->select_jobinfo, buffer, protocol_version)) @@ -3260,6 +3259,109 @@ _unpack_job_info_members(job_info_t * job, Buf buffer, safe_unpackstr_xmalloc(&job->dependency, &uint32_tmp, buffer); safe_unpackstr_xmalloc(&job->command, &uint32_tmp, buffer); + safe_unpack32(&job->num_cpus, buffer); + safe_unpack32(&job->max_cpus, buffer); + safe_unpack32(&job->num_nodes, buffer); + safe_unpack32(&job->max_nodes, buffer); + 
safe_unpack16(&job->requeue, buffer); + + /*** unpack pending job details ***/ + safe_unpack16(&job->shared, buffer); + safe_unpack16(&job->contiguous, buffer); + safe_unpack16(&job->cpus_per_task, buffer); + safe_unpack16(&job->job_min_cpus, buffer); + + safe_unpack32(&job->job_min_memory, buffer); + safe_unpack32(&job->job_min_tmp_disk, buffer); + + safe_unpackstr_xmalloc(&job->req_nodes, &uint32_tmp, buffer); + safe_unpackstr_xmalloc(&node_inx_str, &uint32_tmp, buffer); + if (node_inx_str == NULL) + job->req_node_inx = bitfmt2int(""); + else { + job->req_node_inx = bitfmt2int(node_inx_str); + xfree(node_inx_str); + } + safe_unpackstr_xmalloc(&job->exc_nodes, &uint32_tmp, buffer); + safe_unpackstr_xmalloc(&node_inx_str, &uint32_tmp, buffer); + if (node_inx_str == NULL) + job->exc_node_inx = bitfmt2int(""); + else { + job->exc_node_inx = bitfmt2int(node_inx_str); + xfree(node_inx_str); + } + + if (unpack_multi_core_data(&mc_ptr, buffer)) + goto unpack_error; + if (mc_ptr) { + job->min_sockets = mc_ptr->min_sockets; + job->min_cores = mc_ptr->min_cores; + job->min_threads = mc_ptr->min_threads; + job->ntasks_per_socket = mc_ptr->ntasks_per_socket; + job->ntasks_per_core = mc_ptr->ntasks_per_core; + xfree(mc_ptr); + } + } else if(protocol_version >= SLURM_2_1_PROTOCOL_VERSION) { + safe_unpack32(&job->assoc_id, buffer); + safe_unpack32(&job->job_id, buffer); + safe_unpack32(&job->user_id, buffer); + safe_unpack32(&job->group_id, buffer); + + safe_unpack16(&job->job_state, buffer); + safe_unpack16(&job->batch_flag, buffer); + safe_unpack16(&job->state_reason, buffer); + safe_unpack16(&job->restart_cnt, buffer); + + safe_unpack32(&job->alloc_sid, buffer); + safe_unpack32(&job->time_limit, buffer); + + safe_unpack16(&job->nice, buffer); + + safe_unpack_time(&job->submit_time, buffer); + safe_unpack_time(&job->eligible_time, buffer); + safe_unpack_time(&job->start_time, buffer); + safe_unpack_time(&job->end_time, buffer); + safe_unpack_time(&job->suspend_time, buffer); + safe_unpack_time(&job->pre_sus_time, buffer); + + safe_unpack32(&job->priority, buffer); + + safe_unpackstr_xmalloc(&job->nodes, &uint32_tmp, buffer); + safe_unpackstr_xmalloc(&job->partition, &uint32_tmp, buffer); + safe_unpackstr_xmalloc(&job->account, &uint32_tmp, buffer); + safe_unpackstr_xmalloc(&job->network, &uint32_tmp, buffer); + safe_unpackstr_xmalloc(&job->comment, &uint32_tmp, buffer); + safe_unpackstr_xmalloc(&job->qos, &uint32_tmp, buffer); + safe_unpackstr_xmalloc(&job->licenses, &uint32_tmp, buffer); + safe_unpackstr_xmalloc(&job->state_desc, &uint32_tmp, buffer); + safe_unpackstr_xmalloc(&job->resv_name, &uint32_tmp, buffer); + + safe_unpack32(&job->exit_code, buffer); + unpack_job_resources(&job->job_resrcs, job->nodes, buffer, + protocol_version); + + safe_unpackstr_xmalloc(&job->name, &uint32_tmp, buffer); + safe_unpackstr_xmalloc(&job->wckey, &uint32_tmp, buffer); + safe_unpackstr_xmalloc(&job->alloc_node, &uint32_tmp, buffer); + safe_unpackstr_xmalloc(&node_inx_str, &uint32_tmp, buffer); + if (node_inx_str == NULL) + job->node_inx = bitfmt2int(""); + else { + job->node_inx = bitfmt2int(node_inx_str); + xfree(node_inx_str); + } + safe_unpack32(&job->num_cpus, buffer); + + if (select_g_select_jobinfo_unpack(&job->select_jobinfo, + buffer, protocol_version)) + goto unpack_error; + + /*** unpack default job details ***/ + safe_unpackstr_xmalloc(&job->features, &uint32_tmp, buffer); + safe_unpackstr_xmalloc(&job->work_dir, &uint32_tmp, buffer); + safe_unpackstr_xmalloc(&job->dependency, &uint32_tmp, buffer); + 
safe_unpackstr_xmalloc(&job->command, &uint32_tmp, buffer); + safe_unpack32(&job->num_nodes, buffer); safe_unpack32(&job->max_nodes, buffer); safe_unpack16(&job->requeue, buffer); @@ -3793,7 +3895,7 @@ _pack_job_desc_msg(job_desc_msg_t * job_desc_ptr, Buf buffer, uint16_t protocol_version) { /* load the data values */ - if(protocol_version >= SLURM_2_1_PROTOCOL_VERSION) { + if(protocol_version >= SLURM_2_2_PROTOCOL_VERSION) { pack16(job_desc_ptr->contiguous, buffer); pack16(job_desc_ptr->task_dist, buffer); pack16(job_desc_ptr->kill_on_node_fail, buffer); @@ -3851,7 +3953,147 @@ _pack_job_desc_msg(job_desc_msg_t * job_desc_ptr, Buf buffer, packstr(job_desc_ptr->mem_bind, buffer); pack32(job_desc_ptr->time_limit, buffer); - pack32(job_desc_ptr->num_procs, buffer); + pack32(job_desc_ptr->min_cpus, buffer); + pack32(job_desc_ptr->max_cpus, buffer); + pack32(job_desc_ptr->min_nodes, buffer); + pack32(job_desc_ptr->max_nodes, buffer); + pack16(job_desc_ptr->min_sockets, buffer); + pack16(job_desc_ptr->min_cores, buffer); + pack16(job_desc_ptr->min_threads, buffer); + pack32(job_desc_ptr->user_id, buffer); + pack32(job_desc_ptr->group_id, buffer); + + pack16(job_desc_ptr->alloc_resp_port, buffer); + pack16(job_desc_ptr->other_port, buffer); + packstr(job_desc_ptr->network, buffer); + pack_time(job_desc_ptr->begin_time, buffer); + pack_time(job_desc_ptr->end_time, buffer); + + packstr(job_desc_ptr->licenses, buffer); + pack16(job_desc_ptr->mail_type, buffer); + packstr(job_desc_ptr->mail_user, buffer); + packstr(job_desc_ptr->reservation, buffer); + pack16(job_desc_ptr->warn_signal, buffer); + pack16(job_desc_ptr->warn_time, buffer); + packstr(job_desc_ptr->wckey, buffer); + + if(job_desc_ptr->select_jobinfo) { + select_g_select_jobinfo_pack( + job_desc_ptr->select_jobinfo, + buffer, protocol_version); + } else { + job_desc_ptr->select_jobinfo = + select_g_select_jobinfo_alloc(); +#ifdef HAVE_BG + if(job_desc_ptr->geometry[0] != (uint16_t) NO_VAL) + select_g_select_jobinfo_set( + job_desc_ptr->select_jobinfo, + SELECT_JOBDATA_GEOMETRY, + job_desc_ptr->geometry); +#endif + + if (job_desc_ptr->conn_type != (uint16_t) NO_VAL) + select_g_select_jobinfo_set( + job_desc_ptr->select_jobinfo, + SELECT_JOBDATA_CONN_TYPE, + &(job_desc_ptr->conn_type)); + if (job_desc_ptr->reboot != (uint16_t) NO_VAL) + select_g_select_jobinfo_set( + job_desc_ptr->select_jobinfo, + SELECT_JOBDATA_REBOOT, + &(job_desc_ptr->reboot)); + if (job_desc_ptr->rotate != (uint16_t) NO_VAL) + select_g_select_jobinfo_set( + job_desc_ptr->select_jobinfo, + SELECT_JOBDATA_ROTATE, + &(job_desc_ptr->rotate)); + if (job_desc_ptr->blrtsimage) { + select_g_select_jobinfo_set( + job_desc_ptr->select_jobinfo, + SELECT_JOBDATA_BLRTS_IMAGE, + job_desc_ptr->blrtsimage); + } + if (job_desc_ptr->linuximage) + select_g_select_jobinfo_set( + job_desc_ptr->select_jobinfo, + SELECT_JOBDATA_LINUX_IMAGE, + job_desc_ptr->linuximage); + if (job_desc_ptr->mloaderimage) + select_g_select_jobinfo_set( + job_desc_ptr->select_jobinfo, + SELECT_JOBDATA_MLOADER_IMAGE, + job_desc_ptr->mloaderimage); + if (job_desc_ptr->ramdiskimage) + select_g_select_jobinfo_set( + job_desc_ptr->select_jobinfo, + SELECT_JOBDATA_RAMDISK_IMAGE, + job_desc_ptr->ramdiskimage); + select_g_select_jobinfo_pack( + job_desc_ptr->select_jobinfo, + buffer, protocol_version); + select_g_select_jobinfo_free( + job_desc_ptr->select_jobinfo); + job_desc_ptr->select_jobinfo = NULL; + } + } else if(protocol_version >= SLURM_2_1_PROTOCOL_VERSION) { + pack16(job_desc_ptr->contiguous, buffer); + 
pack16(job_desc_ptr->task_dist, buffer); + pack16(job_desc_ptr->kill_on_node_fail, buffer); + packstr(job_desc_ptr->features, buffer); + pack32(job_desc_ptr->job_id, buffer); + packstr(job_desc_ptr->name, buffer); + + packstr(job_desc_ptr->alloc_node, buffer); + pack32(job_desc_ptr->alloc_sid, buffer); + pack16(job_desc_ptr->job_min_cpus, buffer); + pack32(job_desc_ptr->job_min_memory, buffer); + pack32(job_desc_ptr->job_min_tmp_disk, buffer); + + packstr(job_desc_ptr->partition, buffer); + pack32(job_desc_ptr->priority, buffer); + packstr(job_desc_ptr->dependency, buffer); + packstr(job_desc_ptr->account, buffer); + packstr(job_desc_ptr->comment, buffer); + pack16(job_desc_ptr->nice, buffer); + packstr(job_desc_ptr->qos, buffer); + + pack8(job_desc_ptr->open_mode, buffer); + pack8(job_desc_ptr->overcommit, buffer); + pack16(job_desc_ptr->acctg_freq, buffer); + pack32(job_desc_ptr->num_tasks, buffer); + pack16(job_desc_ptr->ckpt_interval, buffer); + + packstr(job_desc_ptr->req_nodes, buffer); + packstr(job_desc_ptr->exc_nodes, buffer); + packstr_array(job_desc_ptr->environment, job_desc_ptr->env_size, + buffer); + packstr_array(job_desc_ptr->spank_job_env, + job_desc_ptr->spank_job_env_size, buffer); + packstr(job_desc_ptr->script, buffer); + packstr_array(job_desc_ptr->argv, job_desc_ptr->argc, buffer); + + packstr(job_desc_ptr->std_err, buffer); + packstr(job_desc_ptr->std_in, buffer); + packstr(job_desc_ptr->std_out, buffer); + packstr(job_desc_ptr->work_dir, buffer); + packstr(job_desc_ptr->ckpt_dir, buffer); + + pack16(job_desc_ptr->immediate, buffer); + pack16(job_desc_ptr->requeue, buffer); + pack16(job_desc_ptr->shared, buffer); + pack16(job_desc_ptr->cpus_per_task, buffer); + pack16(job_desc_ptr->ntasks_per_node, buffer); + pack16(job_desc_ptr->ntasks_per_socket, buffer); + pack16(job_desc_ptr->ntasks_per_core, buffer); + + pack16(job_desc_ptr->plane_size, buffer); + pack16(job_desc_ptr->cpu_bind_type, buffer); + pack16(job_desc_ptr->mem_bind_type, buffer); + packstr(job_desc_ptr->cpu_bind, buffer); + packstr(job_desc_ptr->mem_bind, buffer); + + pack32(job_desc_ptr->time_limit, buffer); + pack32(job_desc_ptr->min_cpus, buffer); pack32(job_desc_ptr->min_nodes, buffer); pack32(job_desc_ptr->max_nodes, buffer); pack16(job_desc_ptr->min_sockets, buffer); @@ -3949,7 +4191,7 @@ _unpack_job_desc_msg(job_desc_msg_t ** job_desc_buffer_ptr, Buf buffer, job_desc_msg_t *job_desc_ptr; /* alloc memory for structure */ - if(protocol_version >= SLURM_2_1_PROTOCOL_VERSION) { + if(protocol_version >= SLURM_2_2_PROTOCOL_VERSION) { job_desc_ptr = xmalloc(sizeof(job_desc_msg_t)); *job_desc_buffer_ptr = job_desc_ptr; @@ -4029,7 +4271,134 @@ _unpack_job_desc_msg(job_desc_msg_t ** job_desc_buffer_ptr, Buf buffer, &uint32_tmp, buffer); safe_unpack32(&job_desc_ptr->time_limit, buffer); - safe_unpack32(&job_desc_ptr->num_procs, buffer); + safe_unpack32(&job_desc_ptr->min_cpus, buffer); + safe_unpack32(&job_desc_ptr->max_cpus, buffer); + safe_unpack32(&job_desc_ptr->min_nodes, buffer); + safe_unpack32(&job_desc_ptr->max_nodes, buffer); + safe_unpack16(&job_desc_ptr->min_sockets, buffer); + safe_unpack16(&job_desc_ptr->min_cores, buffer); + safe_unpack16(&job_desc_ptr->min_threads, buffer); + safe_unpack32(&job_desc_ptr->user_id, buffer); + safe_unpack32(&job_desc_ptr->group_id, buffer); + + safe_unpack16(&job_desc_ptr->alloc_resp_port, buffer); + safe_unpack16(&job_desc_ptr->other_port, buffer); + safe_unpackstr_xmalloc(&job_desc_ptr->network, + &uint32_tmp, buffer); + 
safe_unpack_time(&job_desc_ptr->begin_time, buffer); + safe_unpack_time(&job_desc_ptr->end_time, buffer); + + safe_unpackstr_xmalloc(&job_desc_ptr->licenses, + &uint32_tmp, buffer); + safe_unpack16(&job_desc_ptr->mail_type, buffer); + safe_unpackstr_xmalloc(&job_desc_ptr->mail_user, + &uint32_tmp, buffer); + safe_unpackstr_xmalloc(&job_desc_ptr->reservation, + &uint32_tmp, buffer); + safe_unpack16(&job_desc_ptr->warn_signal, buffer); + safe_unpack16(&job_desc_ptr->warn_time, buffer); + safe_unpackstr_xmalloc(&job_desc_ptr->wckey, + &uint32_tmp, buffer); + + if (select_g_select_jobinfo_unpack( + &job_desc_ptr->select_jobinfo, + buffer, protocol_version)) + goto unpack_error; + + /* These are set so we don't confuse them later for what is + * set in the select_jobinfo structure. + */ +#ifdef HAVE_BG + job_desc_ptr->geometry[0] = (uint16_t)NO_VAL; +#endif + job_desc_ptr->conn_type = (uint16_t)NO_VAL; + job_desc_ptr->reboot = (uint16_t)NO_VAL; + job_desc_ptr->rotate = (uint16_t)NO_VAL; + job_desc_ptr->blrtsimage = NULL; + job_desc_ptr->linuximage = NULL; + job_desc_ptr->mloaderimage = NULL; + job_desc_ptr->ramdiskimage = NULL; + } else if(protocol_version >= SLURM_2_1_PROTOCOL_VERSION) { + job_desc_ptr = xmalloc(sizeof(job_desc_msg_t)); + *job_desc_buffer_ptr = job_desc_ptr; + + /* load the data values */ + safe_unpack16(&job_desc_ptr->contiguous, buffer); + safe_unpack16(&job_desc_ptr->task_dist, buffer); + safe_unpack16(&job_desc_ptr->kill_on_node_fail, buffer); + safe_unpackstr_xmalloc(&job_desc_ptr->features, + &uint32_tmp, buffer); + safe_unpack32(&job_desc_ptr->job_id, buffer); + safe_unpackstr_xmalloc(&job_desc_ptr->name, + &uint32_tmp, buffer); + + safe_unpackstr_xmalloc(&job_desc_ptr->alloc_node, + &uint32_tmp, buffer); + safe_unpack32(&job_desc_ptr->alloc_sid, buffer); + safe_unpack16(&job_desc_ptr->job_min_cpus, buffer); + safe_unpack32(&job_desc_ptr->job_min_memory, buffer); + safe_unpack32(&job_desc_ptr->job_min_tmp_disk, buffer); + + safe_unpackstr_xmalloc(&job_desc_ptr->partition, + &uint32_tmp, buffer); + safe_unpack32(&job_desc_ptr->priority, buffer); + safe_unpackstr_xmalloc(&job_desc_ptr->dependency, + &uint32_tmp, buffer); + safe_unpackstr_xmalloc(&job_desc_ptr->account, + &uint32_tmp, buffer); + safe_unpackstr_xmalloc(&job_desc_ptr->comment, + &uint32_tmp, buffer); + safe_unpack16(&job_desc_ptr->nice, buffer); + safe_unpackstr_xmalloc(&job_desc_ptr->qos, &uint32_tmp, buffer); + + safe_unpack8(&job_desc_ptr->open_mode, buffer); + safe_unpack8(&job_desc_ptr->overcommit, buffer); + safe_unpack16(&job_desc_ptr->acctg_freq, buffer); + safe_unpack32(&job_desc_ptr->num_tasks, buffer); + safe_unpack16(&job_desc_ptr->ckpt_interval, buffer); + + safe_unpackstr_xmalloc(&job_desc_ptr->req_nodes, + &uint32_tmp, buffer); + safe_unpackstr_xmalloc(&job_desc_ptr->exc_nodes, + &uint32_tmp, buffer); + safe_unpackstr_array(&job_desc_ptr->environment, + &job_desc_ptr->env_size, buffer); + safe_unpackstr_array(&job_desc_ptr->spank_job_env, + &job_desc_ptr->spank_job_env_size, buffer); + safe_unpackstr_xmalloc(&job_desc_ptr->script, + &uint32_tmp, buffer); + safe_unpackstr_array(&job_desc_ptr->argv, + &job_desc_ptr->argc, buffer); + + safe_unpackstr_xmalloc(&job_desc_ptr->std_err, + &uint32_tmp, buffer); + safe_unpackstr_xmalloc(&job_desc_ptr->std_in, + &uint32_tmp, buffer); + safe_unpackstr_xmalloc(&job_desc_ptr->std_out, + &uint32_tmp, buffer); + safe_unpackstr_xmalloc(&job_desc_ptr->work_dir, + &uint32_tmp, buffer); + safe_unpackstr_xmalloc(&job_desc_ptr->ckpt_dir, + &uint32_tmp, buffer); + + 
safe_unpack16(&job_desc_ptr->immediate, buffer); + safe_unpack16(&job_desc_ptr->requeue, buffer); + safe_unpack16(&job_desc_ptr->shared, buffer); + safe_unpack16(&job_desc_ptr->cpus_per_task, buffer); + safe_unpack16(&job_desc_ptr->ntasks_per_node, buffer); + safe_unpack16(&job_desc_ptr->ntasks_per_socket, buffer); + safe_unpack16(&job_desc_ptr->ntasks_per_core, buffer); + + safe_unpack16(&job_desc_ptr->plane_size, buffer); + safe_unpack16(&job_desc_ptr->cpu_bind_type, buffer); + safe_unpack16(&job_desc_ptr->mem_bind_type, buffer); + safe_unpackstr_xmalloc(&job_desc_ptr->cpu_bind, + &uint32_tmp, buffer); + safe_unpackstr_xmalloc(&job_desc_ptr->mem_bind, + &uint32_tmp, buffer); + + safe_unpack32(&job_desc_ptr->time_limit, buffer); + safe_unpack32(&job_desc_ptr->min_cpus, buffer); safe_unpack32(&job_desc_ptr->min_nodes, buffer); safe_unpack32(&job_desc_ptr->max_nodes, buffer); safe_unpack16(&job_desc_ptr->min_sockets, buffer); @@ -4076,6 +4445,7 @@ _unpack_job_desc_msg(job_desc_msg_t ** job_desc_buffer_ptr, Buf buffer, job_desc_ptr->mloaderimage = NULL; job_desc_ptr->ramdiskimage = NULL; } + return SLURM_SUCCESS; unpack_error: diff --git a/src/common/slurmdbd_defs.c b/src/common/slurmdbd_defs.c index d81150a1013eda11a21774cca10b768492424356..e3b6c902aad0c50428f3fec51697609e44360af7 100644 --- a/src/common/slurmdbd_defs.c +++ b/src/common/slurmdbd_defs.c @@ -444,11 +444,11 @@ extern Buf pack_slurmdbd_msg(uint16_t rpc_version, slurmdbd_msg_t *req) case DBD_ARCHIVE_LOAD: pack_acct_archive_rec(req->data, rpc_version, buffer); break; - case DBD_CLUSTER_PROCS: + case DBD_CLUSTER_CPUS: case DBD_FLUSH_JOBS: - slurmdbd_pack_cluster_procs_msg( + slurmdbd_pack_cluster_cpus_msg( rpc_version, - (dbd_cluster_procs_msg_t *)req->data, buffer); + (dbd_cluster_cpus_msg_t *)req->data, buffer); break; case DBD_GET_ACCOUNTS: case DBD_GET_ASSOCS: @@ -615,11 +615,11 @@ extern int unpack_slurmdbd_msg(uint16_t rpc_version, case DBD_ARCHIVE_LOAD: rc = unpack_acct_archive_rec(&resp->data, rpc_version, buffer); break; - case DBD_CLUSTER_PROCS: + case DBD_CLUSTER_CPUS: case DBD_FLUSH_JOBS: - rc = slurmdbd_unpack_cluster_procs_msg( + rc = slurmdbd_unpack_cluster_cpus_msg( rpc_version, - (dbd_cluster_procs_msg_t **)&resp->data, buffer); + (dbd_cluster_cpus_msg_t **)&resp->data, buffer); break; case DBD_GET_ACCOUNTS: case DBD_GET_ASSOCS: @@ -772,7 +772,7 @@ extern slurmdbd_msg_type_t str_2_slurmdbd_msg_type(char *msg_type) } else if(!strcasecmp(msg_type, "Add Users")) { return DBD_ADD_USERS; } else if(!strcasecmp(msg_type, "Cluster Processors")) { - return DBD_CLUSTER_PROCS; + return DBD_CLUSTER_CPUS; } else if(!strcasecmp(msg_type, "Flush Jobs")) { return DBD_FLUSH_JOBS; } else if(!strcasecmp(msg_type, "Get Accounts")) { @@ -949,9 +949,9 @@ extern char *slurmdbd_msg_type_2_str(slurmdbd_msg_type_t msg_type, int get_enum) } else return "Add Users"; break; - case DBD_CLUSTER_PROCS: + case DBD_CLUSTER_CPUS: if(get_enum) { - return "DBD_CLUSTER_PROCS"; + return "DBD_CLUSTER_CPUS"; } else return "Cluster Processors"; break; @@ -2084,8 +2084,8 @@ void inline slurmdbd_free_acct_coord_msg(uint16_t rpc_version, } } -void inline slurmdbd_free_cluster_procs_msg(uint16_t rpc_version, - dbd_cluster_procs_msg_t *msg) +void inline slurmdbd_free_cluster_cpus_msg(uint16_t rpc_version, + dbd_cluster_cpus_msg_t *msg) { if (msg) { xfree(msg->cluster_name); @@ -2434,29 +2434,29 @@ unpack_error: } void inline -slurmdbd_pack_cluster_procs_msg(uint16_t rpc_version, - dbd_cluster_procs_msg_t *msg, Buf buffer) 
+slurmdbd_pack_cluster_cpus_msg(uint16_t rpc_version, + dbd_cluster_cpus_msg_t *msg, Buf buffer) { if(rpc_version >= 5) { packstr(msg->cluster_name, buffer); packstr(msg->cluster_nodes, buffer); - pack32(msg->proc_count, buffer); + pack32(msg->cpu_count, buffer); pack_time(msg->event_time, buffer); } else { packstr(msg->cluster_name, buffer); - pack32(msg->proc_count, buffer); + pack32(msg->cpu_count, buffer); pack_time(msg->event_time, buffer); } } int inline -slurmdbd_unpack_cluster_procs_msg(uint16_t rpc_version, - dbd_cluster_procs_msg_t **msg, Buf buffer) +slurmdbd_unpack_cluster_cpus_msg(uint16_t rpc_version, + dbd_cluster_cpus_msg_t **msg, Buf buffer) { - dbd_cluster_procs_msg_t *msg_ptr; + dbd_cluster_cpus_msg_t *msg_ptr; uint32_t uint32_tmp; - msg_ptr = xmalloc(sizeof(dbd_cluster_procs_msg_t)); + msg_ptr = xmalloc(sizeof(dbd_cluster_cpus_msg_t)); *msg = msg_ptr; if(rpc_version >= 5) { @@ -2464,18 +2464,18 @@ slurmdbd_unpack_cluster_procs_msg(uint16_t rpc_version, &uint32_tmp, buffer); safe_unpackstr_xmalloc(&msg_ptr->cluster_nodes, &uint32_tmp, buffer); - safe_unpack32(&msg_ptr->proc_count, buffer); + safe_unpack32(&msg_ptr->cpu_count, buffer); safe_unpack_time(&msg_ptr->event_time, buffer); } else { safe_unpackstr_xmalloc(&msg_ptr->cluster_name, &uint32_tmp, buffer); - safe_unpack32(&msg_ptr->proc_count, buffer); + safe_unpack32(&msg_ptr->cpu_count, buffer); safe_unpack_time(&msg_ptr->event_time, buffer); } return SLURM_SUCCESS; unpack_error: - slurmdbd_free_cluster_procs_msg(rpc_version, msg_ptr); + slurmdbd_free_cluster_cpus_msg(rpc_version, msg_ptr); *msg = NULL; return SLURM_ERROR; } @@ -3550,7 +3550,7 @@ slurmdbd_pack_step_complete_msg(uint16_t rpc_version, pack_time(msg->start_time, buffer); pack_time(msg->job_submit_time, buffer); pack32(msg->step_id, buffer); - pack32(msg->total_procs, buffer); + pack32(msg->total_cpus, buffer); } int inline @@ -3569,7 +3569,7 @@ slurmdbd_unpack_step_complete_msg(uint16_t rpc_version, safe_unpack_time(&msg_ptr->start_time, buffer); safe_unpack_time(&msg_ptr->job_submit_time, buffer); safe_unpack32(&msg_ptr->step_id, buffer); - safe_unpack32(&msg_ptr->total_procs, buffer); + safe_unpack32(&msg_ptr->total_cpus, buffer); return SLURM_SUCCESS; unpack_error: @@ -3594,7 +3594,7 @@ slurmdbd_pack_step_start_msg(uint16_t rpc_version, dbd_step_start_msg_t *msg, pack_time(msg->job_submit_time, buffer); pack32(msg->step_id, buffer); pack16(msg->task_dist, buffer); - pack32(msg->total_procs, buffer); + pack32(msg->total_cpus, buffer); pack32(msg->total_tasks, buffer); } else { pack32(msg->assoc_id, buffer); @@ -3605,7 +3605,7 @@ slurmdbd_pack_step_start_msg(uint16_t rpc_version, dbd_step_start_msg_t *msg, pack_time(msg->start_time, buffer); pack_time(msg->job_submit_time, buffer); pack32(msg->step_id, buffer); - pack32(msg->total_procs, buffer); + pack32(msg->total_cpus, buffer); } } @@ -3628,7 +3628,7 @@ slurmdbd_unpack_step_start_msg(uint16_t rpc_version, safe_unpack_time(&msg_ptr->job_submit_time, buffer); safe_unpack32(&msg_ptr->step_id, buffer); safe_unpack16(&msg_ptr->task_dist, buffer); - safe_unpack32(&msg_ptr->total_procs, buffer); + safe_unpack32(&msg_ptr->total_cpus, buffer); safe_unpack32(&msg_ptr->total_tasks, buffer); } else { safe_unpack32(&msg_ptr->assoc_id, buffer); @@ -3639,7 +3639,7 @@ slurmdbd_unpack_step_start_msg(uint16_t rpc_version, safe_unpack_time(&msg_ptr->start_time, buffer); safe_unpack_time(&msg_ptr->job_submit_time, buffer); safe_unpack32(&msg_ptr->step_id, buffer); - safe_unpack32(&msg_ptr->total_procs, buffer); + 
safe_unpack32(&msg_ptr->total_cpus, buffer); } return SLURM_SUCCESS; diff --git a/src/common/slurmdbd_defs.h b/src/common/slurmdbd_defs.h index 29aac0b8195fe77e649b0a1396fbf9e75c6be556..2c0111138642828b722dc658f35cc1b88545e166 100644 --- a/src/common/slurmdbd_defs.h +++ b/src/common/slurmdbd_defs.h @@ -89,7 +89,7 @@ typedef enum { DBD_ADD_ASSOCS, /* Add new association to the mix */ DBD_ADD_CLUSTERS, /* Add new cluster to the mix */ DBD_ADD_USERS, /* Add new user to the mix */ - DBD_CLUSTER_PROCS, /* Record total processors on cluster */ + DBD_CLUSTER_CPUS, /* Record total processors on cluster */ DBD_FLUSH_JOBS, /* End jobs that are still running * when a controller is restarted. */ DBD_GET_ACCOUNTS, /* Get account information */ @@ -177,17 +177,17 @@ typedef struct { acct_user_cond_t *cond; } dbd_acct_coord_msg_t; -typedef struct dbd_cluster_procs_msg { +typedef struct dbd_cluster_cpus_msg { char *cluster_name; /* name of cluster */ char *cluster_nodes; /* name of cluster */ - uint32_t proc_count; /* total processor count */ + uint32_t cpu_count; /* total processor count */ time_t event_time; /* time of transition */ -} dbd_cluster_procs_msg_t; +} dbd_cluster_cpus_msg_t; typedef struct { void *rec; /* this could be anything based on the type types - * are defined in slurm_accounting_storage.h - * *_rec_t */ + * are defined in slurm_accounting_storage.h + * *_rec_t */ } dbd_rec_msg_t; typedef struct { @@ -339,7 +339,7 @@ typedef struct dbd_step_comp_msg { time_t job_submit_time;/* job submit time needed to find job record * in db */ uint32_t step_id; /* step ID */ - uint32_t total_procs; /* count of allocated processors */ + uint32_t total_cpus; /* count of allocated processors */ uint32_t total_tasks; /* count of tasks for step */ } dbd_step_comp_msg_t; @@ -357,7 +357,7 @@ typedef struct dbd_step_start_msg { * in db */ uint32_t step_id; /* step ID */ uint16_t task_dist; /* layout method of step */ - uint32_t total_procs; /* count of allocated processors */ + uint32_t total_cpus; /* count of allocated processors */ uint32_t total_tasks; /* count of tasks for step */ } dbd_step_start_msg_t; @@ -419,8 +419,8 @@ extern char *slurmdbd_msg_type_2_str(slurmdbd_msg_type_t msg_type, \*****************************************************************************/ void inline slurmdbd_free_acct_coord_msg(uint16_t rpc_version, dbd_acct_coord_msg_t *msg); -void inline slurmdbd_free_cluster_procs_msg(uint16_t rpc_version, - dbd_cluster_procs_msg_t *msg); +void inline slurmdbd_free_cluster_cpus_msg(uint16_t rpc_version, + dbd_cluster_cpus_msg_t *msg); void inline slurmdbd_free_rec_msg(uint16_t rpc_version, slurmdbd_msg_type_t type, dbd_rec_msg_t *msg); @@ -438,7 +438,7 @@ void inline slurmdbd_free_job_complete_msg(uint16_t rpc_version, void inline slurmdbd_free_job_start_msg(uint16_t rpc_version, dbd_job_start_msg_t *msg); void inline slurmdbd_free_id_rc_msg(uint16_t rpc_version, - dbd_id_rc_msg_t *msg); + dbd_id_rc_msg_t *msg); void inline slurmdbd_free_job_suspend_msg(uint16_t rpc_version, dbd_job_suspend_msg_t *msg); void inline slurmdbd_free_list_msg(uint16_t rpc_version, @@ -468,9 +468,9 @@ void inline slurmdbd_free_usage_msg(uint16_t rpc_version, void inline slurmdbd_pack_acct_coord_msg(uint16_t rpc_version, dbd_acct_coord_msg_t *msg, Buf buffer); -void inline slurmdbd_pack_cluster_procs_msg(uint16_t rpc_version, - dbd_cluster_procs_msg_t *msg, - Buf buffer); +void inline slurmdbd_pack_cluster_cpus_msg(uint16_t rpc_version, + dbd_cluster_cpus_msg_t *msg, + Buf buffer); void inline 
slurmdbd_pack_rec_msg(uint16_t rpc_version, slurmdbd_msg_type_t type, dbd_rec_msg_t *msg, Buf buffer); @@ -528,9 +528,9 @@ void inline slurmdbd_pack_usage_msg(uint16_t rpc_version, int inline slurmdbd_unpack_acct_coord_msg(uint16_t rpc_version, dbd_acct_coord_msg_t **msg, Buf buffer); -int inline slurmdbd_unpack_cluster_procs_msg(uint16_t rpc_version, - dbd_cluster_procs_msg_t **msg, - Buf buffer); +int inline slurmdbd_unpack_cluster_cpus_msg(uint16_t rpc_version, + dbd_cluster_cpus_msg_t **msg, + Buf buffer); int inline slurmdbd_unpack_rec_msg(uint16_t rpc_version, slurmdbd_msg_type_t type, dbd_rec_msg_t **msg, Buf buffer); diff --git a/src/plugins/accounting_storage/filetxt/accounting_storage_filetxt.c b/src/plugins/accounting_storage/filetxt/accounting_storage_filetxt.c index c1e4372354ce9e7be3cf8d4abf6fe4d78ce45ec6..eeb0395cde6632dc31ee6f9129094fadca7b2110 100644 --- a/src/plugins/accounting_storage/filetxt/accounting_storage_filetxt.c +++ b/src/plugins/accounting_storage/filetxt/accounting_storage_filetxt.c @@ -508,10 +508,10 @@ extern int clusteracct_storage_p_register_ctld(void *db_conn, return SLURM_SUCCESS; } -extern int clusteracct_storage_p_cluster_procs(void *db_conn, +extern int clusteracct_storage_p_cluster_cpus(void *db_conn, char *cluster, char *cluster_nodes, - uint32_t procs, + uint32_t cpus, time_t event_time) { return SLURM_SUCCESS; @@ -584,7 +584,7 @@ extern int jobacct_storage_p_job_start(void *db_conn, char *cluster_name, snprintf(buf, BUFFER_SIZE, "%d %s %d %ld %u %s %s", JOB_START, jname, - track_steps, priority, job_ptr->total_procs, + track_steps, priority, job_ptr->total_cpus, nodes, account); rc = _print_record(job_ptr, job_ptr->start_time, buf); @@ -642,7 +642,10 @@ extern int jobacct_storage_p_step_start(void *db_conn, } #ifdef HAVE_BG - cpus = step_ptr->job_ptr->num_procs; + if(step_ptr->job_ptr->details) + cpus = step_ptr->job_ptr->details->min_cpus; + else + cpus = step_ptr->job_ptr->cpu_cnt; select_g_select_jobinfo_get(step_ptr->job_ptr->select_jobinfo, SELECT_JOBDATA_IONODES, &ionodes); @@ -656,7 +659,7 @@ extern int jobacct_storage_p_step_start(void *db_conn, #else if(!step_ptr->step_layout || !step_ptr->step_layout->task_cnt) { - cpus = step_ptr->job_ptr->total_procs; + cpus = step_ptr->job_ptr->total_cpus; snprintf(node_list, BUFFER_SIZE, "%s", step_ptr->job_ptr->nodes); } else { cpus = step_ptr->step_layout->task_cnt; @@ -773,7 +776,10 @@ extern int jobacct_storage_p_step_complete(void *db_conn, comp_status = JOB_COMPLETE; #ifdef HAVE_BG - cpus = step_ptr->job_ptr->num_procs; + if(step_ptr->job_ptr->details) + cpus = step_ptr->job_ptr->details->min_cpus; + else + cpus = step_ptr->job_ptr->cpu_cnt; select_g_select_jobinfo_get(step_ptr->job_ptr->select_jobinfo, SELECT_JOBDATA_IONODES, &ionodes); @@ -787,7 +793,7 @@ extern int jobacct_storage_p_step_complete(void *db_conn, #else if(!step_ptr->step_layout || !step_ptr->step_layout->task_cnt) { - cpus = step_ptr->job_ptr->total_procs; + cpus = step_ptr->job_ptr->total_cpus; snprintf(node_list, BUFFER_SIZE, "%s", step_ptr->job_ptr->nodes); } else { diff --git a/src/plugins/accounting_storage/mysql/accounting_storage_mysql.c b/src/plugins/accounting_storage/mysql/accounting_storage_mysql.c index 7dbd24f0d7ed4ec6ccf70b5c972a5f42861a4a67..f12ef7a1f61f648e98b08e1a867da13edeecdfa8 100644 --- a/src/plugins/accounting_storage/mysql/accounting_storage_mysql.c +++ b/src/plugins/accounting_storage/mysql/accounting_storage_mysql.c @@ -10479,10 +10479,10 @@ extern int clusteracct_storage_p_register_ctld(mysql_conn_t 
*mysql_conn, return mysql_db_query(mysql_conn->db_conn, query); } -extern int clusteracct_storage_p_cluster_procs(mysql_conn_t *mysql_conn, +extern int clusteracct_storage_p_cluster_cpus(mysql_conn_t *mysql_conn, char *cluster, char *cluster_nodes, - uint32_t procs, + uint32_t cpus, time_t event_time) { char* query; @@ -10523,8 +10523,8 @@ extern int clusteracct_storage_p_cluster_procs(mysql_conn_t *mysql_conn, goto add_it; } - if(atoi(row[0]) == procs) { - debug3("we have the same procs as before no need to " + if(atoi(row[0]) == cpus) { + debug3("we have the same cpu count as before no need to " "update the database."); if(cluster_nodes) { if(!row[1][0]) { @@ -10549,9 +10549,9 @@ extern int clusteracct_storage_p_cluster_procs(mysql_conn_t *mysql_conn, goto end_it; } else debug("%s has changed from %s cpus to %u", - cluster, row[0], procs); + cluster, row[0], cpus); - /* reset all the entries for this cluster since the procs + /* reset all the entries for this cluster since the cpus changed some of the downed nodes may have gone away. Request them again with ACCOUNTING_FIRST_REG */ query = xstrdup_printf( @@ -10568,7 +10568,7 @@ add_it: "insert into %s (cluster, cluster_nodes, cpu_count, " "period_start, reason) " "values (\"%s\", \"%s\", %u, %d, 'Cluster processor count')", - event_table, cluster, cluster_nodes, procs, event_time); + event_table, cluster, cluster_nodes, cpus, event_time); rc = mysql_db_query(mysql_conn->db_conn, query); xfree(query); end_it: @@ -10888,8 +10888,8 @@ no_rollup_change: (int)job_ptr->start_time, jname, track_steps, job_ptr->job_state & JOB_STATE_BASE, - job_ptr->priority, job_ptr->num_procs, - job_ptr->total_procs, node_cnt, + job_ptr->priority, job_ptr->details->min_cpus, + job_ptr->total_cpus, node_cnt, job_ptr->job_state & JOB_STATE_BASE, job_ptr->assoc_id, wckeyid, job_ptr->resv_id, job_ptr->time_limit); @@ -10945,7 +10945,7 @@ no_rollup_change: "where id=%d", (int)job_ptr->start_time, jname, job_ptr->job_state & JOB_STATE_BASE, - job_ptr->total_procs, node_cnt, + job_ptr->total_cpus, node_cnt, job_ptr->assoc_id, wckeyid, job_ptr->resv_id, job_ptr->time_limit, job_ptr->db_index); @@ -11084,7 +11084,7 @@ extern int jobacct_storage_p_step_start(mysql_conn_t *mysql_conn, step_ptr->step_node_bitmap); } #ifdef HAVE_BG - tasks = cpus = step_ptr->job_ptr->num_procs; + tasks = cpus = step_ptr->job_ptr->details->min_cpus; select_g_select_jobinfo_get(step_ptr->job_ptr->select_jobinfo, SELECT_JOBDATA_IONODES, &ionodes); @@ -11100,7 +11100,7 @@ extern int jobacct_storage_p_step_start(mysql_conn_t *mysql_conn, &nodes); #else if(!step_ptr->step_layout || !step_ptr->step_layout->task_cnt) { - tasks = cpus = step_ptr->job_ptr->total_procs; + tasks = cpus = step_ptr->job_ptr->total_cpus; snprintf(node_list, BUFFER_SIZE, "%s", step_ptr->job_ptr->nodes); nodes = step_ptr->job_ptr->node_cnt; @@ -11199,11 +11199,11 @@ extern int jobacct_storage_p_step_complete(mysql_conn_t *mysql_conn, } else { now = time(NULL); #ifdef HAVE_BG - tasks = cpus = step_ptr->job_ptr->num_procs; + tasks = cpus = step_ptr->job_ptr->details->min_cpus; #else if(!step_ptr->step_layout || !step_ptr->step_layout->task_cnt) - tasks = cpus = step_ptr->job_ptr->total_procs; + tasks = cpus = step_ptr->job_ptr->total_cpus; else { cpus = step_ptr->cpu_count; tasks = step_ptr->step_layout->task_cnt; diff --git a/src/plugins/accounting_storage/none/accounting_storage_none.c b/src/plugins/accounting_storage/none/accounting_storage_none.c index 
09c7b2f347417298062ec119ddc1e982f4b638b5..cc1ee01f5a8fecaddb5984a641208b266980a40f 100644 --- a/src/plugins/accounting_storage/none/accounting_storage_none.c +++ b/src/plugins/accounting_storage/none/accounting_storage_none.c @@ -354,10 +354,10 @@ extern int clusteracct_storage_p_register_ctld(void *db_conn, return SLURM_SUCCESS; } -extern int clusteracct_storage_p_cluster_procs(void *db_conn, +extern int clusteracct_storage_p_cluster_cpus(void *db_conn, char *cluster, char *cluster_nodes, - uint32_t procs, + uint32_t cpus, time_t event_time) { return SLURM_SUCCESS; diff --git a/src/plugins/accounting_storage/pgsql/accounting_storage_pgsql.c b/src/plugins/accounting_storage/pgsql/accounting_storage_pgsql.c index 537b7ee95094200b56cc6fbb8a2b07f9ea8e43e3..c6e89b50d34c754590c1bbeacfe60adcff60dede 100644 --- a/src/plugins/accounting_storage/pgsql/accounting_storage_pgsql.c +++ b/src/plugins/accounting_storage/pgsql/accounting_storage_pgsql.c @@ -1117,31 +1117,31 @@ extern int clusteracct_storage_p_register_ctld(PGconn *acct_pgsql_db, return SLURM_SUCCESS; } -extern int clusteracct_storage_p_cluster_procs(PGconn *acct_pgsql_db, +extern int clusteracct_storage_p_cluster_cpus(PGconn *acct_pgsql_db, char *cluster, char *cluster_nodes, - uint32_t procs, + uint32_t cpus, time_t event_time) { #ifdef HAVE_PGSQL - static uint32_t last_procs = -1; + static uint32_t last_cpus = -1; char* query; int rc = SLURM_SUCCESS; PGresult *result = NULL; - int got_procs = 0; + int got_cpus = 0; - if (procs == last_procs) { - debug3("we have the same procs as before no need to " + if (cpus == last_cpus) { + debug3("we have the same cpus as before no need to " "update the database."); return SLURM_SUCCESS; } - last_procs = procs; + last_cpus = cpus; /* Record the processor count */ #if _DEBUG slurm_make_time_str(&event_time, tmp_buff, sizeof(tmp_buff)); - info("cluster_acct_procs: %s has %u total CPUs at %s", - cluster, procs, tmp_buff); + info("cluster_acct_cpus: %s has %u total CPUs at %s", + cluster, cpus, tmp_buff); #endif query = xstrdup_printf( "select cpu_count from %s where cluster='%s' " @@ -1159,12 +1159,12 @@ extern int clusteracct_storage_p_cluster_procs(PGconn *acct_pgsql_db, "most likely a first time running.", cluster); goto add_it; } - got_procs = atoi(PQgetvalue(result, 0, 0)); - if(got_procs == procs) { + got_cpus = atoi(PQgetvalue(result, 0, 0)); + if(got_cpus == cpus) { debug("%s hasn't changed since last entry", cluster); goto end_it; } - debug("%s has changed from %d cpus to %u", cluster, got_procs, procs); + debug("%s has changed from %d cpus to %u", cluster, got_cpus, cpus); query = xstrdup_printf( "update %s set period_end=%u where cluster='%s' " @@ -1178,7 +1178,7 @@ add_it: query = xstrdup_printf( "insert into %s (cluster, cpu_count, period_start, reason) " "values ('%s', %u, %d, 'Cluster processor count')", - event_table, cluster, procs, event_time); + event_table, cluster, cpus, event_time); rc = pgsql_db_query(acct_pgsql_db, query); xfree(query); @@ -1295,8 +1295,8 @@ extern int jobacct_storage_p_job_start(PGconn *acct_pgsql_db, (int)job_ptr->start_time, jname, track_steps, job_ptr->job_state & JOB_STATE_BASE, - priority, job_ptr->num_procs, - job_ptr->total_procs); + priority, job_ptr->details->min_cpus, + job_ptr->total_cpus); try_again: if(!(job_ptr->db_index = pgsql_insert_ret_id(acct_pgsql_db, "job_table_id_seq", @@ -1332,7 +1332,7 @@ extern int jobacct_storage_p_job_start(PGconn *acct_pgsql_db, "alloc_cpus=%u, associd=%d where id=%d", (int)job_ptr->start_time, jname, 
job_ptr->job_state & JOB_STATE_BASE, - job_ptr->total_procs, job_ptr->assoc_id, + job_ptr->total_cpus, job_ptr->assoc_id, job_ptr->db_index); rc = pgsql_db_query(acct_pgsql_db, query); } @@ -1445,12 +1445,12 @@ extern int jobacct_storage_p_step_start(PGconn *acct_pgsql_db, } if(slurmdbd_conf) { - cpus = step_ptr->job_ptr->total_procs; + cpus = step_ptr->job_ptr->total_cpus; snprintf(node_list, BUFFER_SIZE, "%s", step_ptr->job_ptr->nodes); } else { #ifdef HAVE_BG - cpus = step_ptr->job_ptr->num_procs; + cpus = step_ptr->job_ptr->details->min_cpus; select_g_select_jobinfo_get(step_ptr->job_ptr->select_jobinfo, SELECT_JOBDATA_IONODES, &ionodes); @@ -1464,7 +1464,7 @@ extern int jobacct_storage_p_step_start(PGconn *acct_pgsql_db, #else if(!step_ptr->step_layout || !step_ptr->step_layout->task_cnt) { - cpus = step_ptr->job_ptr->total_procs; + cpus = step_ptr->job_ptr->total_cpus; snprintf(node_list, BUFFER_SIZE, "%s", step_ptr->job_ptr->nodes); } else { @@ -1544,16 +1544,16 @@ extern int jobacct_storage_p_step_complete(PGconn *acct_pgsql_db, if(slurmdbd_conf) { now = step_ptr->job_ptr->end_time; - cpus = step_ptr->job_ptr->total_procs; + cpus = step_ptr->job_ptr->total_cpus; } else { now = time(NULL); #ifdef HAVE_BG - cpus = step_ptr->job_ptr->num_procs; + cpus = step_ptr->job_ptr->details->min_cpus; #else if(!step_ptr->step_layout || !step_ptr->step_layout->task_cnt) - cpus = step_ptr->job_ptr->total_procs; + cpus = step_ptr->job_ptr->total_cpus; else cpus = step_ptr->step_layout->task_cnt; #endif diff --git a/src/plugins/accounting_storage/slurmdbd/accounting_storage_slurmdbd.c b/src/plugins/accounting_storage/slurmdbd/accounting_storage_slurmdbd.c index 84e1839ff1a3548e09fe1f5f30ac377b0efdbfeb..24a812031c94f17dd7b39deaf546b43589686761 100644 --- a/src/plugins/accounting_storage/slurmdbd/accounting_storage_slurmdbd.c +++ b/src/plugins/accounting_storage/slurmdbd/accounting_storage_slurmdbd.c @@ -1530,23 +1530,23 @@ extern int clusteracct_storage_p_node_up(void *db_conn, return SLURM_SUCCESS; } -extern int clusteracct_storage_p_cluster_procs(void *db_conn, +extern int clusteracct_storage_p_cluster_cpus(void *db_conn, char *cluster, char *cluster_nodes, - uint32_t procs, + uint32_t cpus, time_t event_time) { slurmdbd_msg_t msg; - dbd_cluster_procs_msg_t req; + dbd_cluster_cpus_msg_t req; int rc = SLURM_ERROR; debug2("Sending info for cluster %s", cluster); - memset(&req, 0, sizeof(dbd_cluster_procs_msg_t)); + memset(&req, 0, sizeof(dbd_cluster_cpus_msg_t)); req.cluster_name = cluster; req.cluster_nodes = cluster_nodes; - req.proc_count = procs; + req.cpu_count = cpus; req.event_time = event_time; - msg.msg_type = DBD_CLUSTER_PROCS; + msg.msg_type = DBD_CLUSTER_CPUS; msg.data = &req; slurm_send_slurmdbd_recv_rc_msg(SLURMDBD_VERSION, &msg, &rc); @@ -1643,7 +1643,7 @@ extern int jobacct_storage_p_job_start(void *db_conn, char *cluster_name, } memset(&req, 0, sizeof(dbd_job_start_msg_t)); - req.alloc_cpus = job_ptr->total_procs; + req.alloc_cpus = job_ptr->total_cpus; req.cluster = cluster_name; req.account = job_ptr->account; req.assoc_id = job_ptr->assoc_id; @@ -1672,7 +1672,8 @@ extern int jobacct_storage_p_job_start(void *db_conn, char *cluster_name, } req.partition = job_ptr->partition; - req.req_cpus = job_ptr->num_procs; + if (job_ptr->details) + req.req_cpus = job_ptr->details->min_cpus; req.resv_id = job_ptr->resv_id; req.priority = job_ptr->priority; req.start_time = job_ptr->start_time; @@ -1775,7 +1776,10 @@ extern int jobacct_storage_p_step_start(void *db_conn, #ifdef HAVE_BG char 
*ionodes = NULL; - cpus = tasks = step_ptr->job_ptr->num_procs; + if(step_ptr->job_ptr->details) + cpus = step_ptr->job_ptr->details->min_cpus; + else + cpus = step_ptr->job_ptr->cpu_cnt; select_g_select_jobinfo_get(step_ptr->job_ptr->select_jobinfo, SELECT_JOBDATA_IONODES, &ionodes); @@ -1792,7 +1796,7 @@ extern int jobacct_storage_p_step_start(void *db_conn, &nodes); #else if (!step_ptr->step_layout || !step_ptr->step_layout->task_cnt) { - cpus = tasks = step_ptr->job_ptr->total_procs; + cpus = tasks = step_ptr->job_ptr->total_cpus; snprintf(node_list, BUFFER_SIZE, "%s", step_ptr->job_ptr->nodes); nodes = step_ptr->job_ptr->node_cnt; @@ -1832,7 +1836,7 @@ extern int jobacct_storage_p_step_start(void *db_conn, if (step_ptr->step_layout) req.task_dist = step_ptr->step_layout->task_dist; req.task_dist = task_dist; - req.total_procs = cpus; + req.total_cpus = cpus; req.total_tasks = tasks; msg.msg_type = DBD_STEP_START; @@ -1858,7 +1862,10 @@ extern int jobacct_storage_p_step_complete(void *db_conn, #ifdef HAVE_BG char *ionodes = NULL; - cpus = tasks = step_ptr->job_ptr->num_procs; + if(step_ptr->job_ptr->details) + cpus = step_ptr->job_ptr->details->min_cpus; + else + cpus = step_ptr->job_ptr->cpu_cnt; select_g_select_jobinfo_get(step_ptr->job_ptr->select_jobinfo, SELECT_JOBDATA_IONODES, &ionodes); @@ -1873,7 +1880,7 @@ extern int jobacct_storage_p_step_complete(void *db_conn, #else if (!step_ptr->step_layout || !step_ptr->step_layout->task_cnt) { - cpus = tasks = step_ptr->job_ptr->total_procs; + cpus = tasks = step_ptr->job_ptr->total_cpus; snprintf(node_list, BUFFER_SIZE, "%s", step_ptr->job_ptr->nodes); } else { @@ -1905,7 +1912,7 @@ extern int jobacct_storage_p_step_complete(void *db_conn, if (step_ptr->job_ptr->details) req.job_submit_time = step_ptr->job_ptr->details->submit_time; req.step_id = step_ptr->step_id; - req.total_procs = cpus; + req.total_cpus = cpus; req.total_tasks = tasks; msg.msg_type = DBD_STEP_COMPLETE; @@ -2071,15 +2078,15 @@ extern int acct_storage_p_flush_jobs_on_cluster(void *db_conn, char *cluster, time_t event_time) { slurmdbd_msg_t msg; - dbd_cluster_procs_msg_t req; + dbd_cluster_cpus_msg_t req; info("Ending any jobs in accounting that were running when controller " "went down on cluster %s", cluster); - memset(&req, 0, sizeof(dbd_cluster_procs_msg_t)); + memset(&req, 0, sizeof(dbd_cluster_cpus_msg_t)); req.cluster_name = cluster; - req.proc_count = 0; + req.cpu_count = 0; req.event_time = event_time; msg.msg_type = DBD_FLUSH_JOBS; diff --git a/src/plugins/jobcomp/filetxt/jobcomp_filetxt.c b/src/plugins/jobcomp/filetxt/jobcomp_filetxt.c index 5e2127c6ddfcd4c649a82b4f4b32c80ef130d6e3..1d8e3188a31d178df4fb4dd6aa45c4f15d1694ba 100644 --- a/src/plugins/jobcomp/filetxt/jobcomp_filetxt.c +++ b/src/plugins/jobcomp/filetxt/jobcomp_filetxt.c @@ -288,7 +288,7 @@ extern int slurm_jobcomp_log_record ( struct job_record *job_ptr ) job_state_string(job_state), job_ptr->partition, lim_str, start_str, end_str, job_ptr->nodes, job_ptr->node_cnt, - job_ptr->total_procs, work_dir, + job_ptr->total_cpus, work_dir, select_buf); tot_size = strlen(job_rec); diff --git a/src/plugins/jobcomp/mysql/jobcomp_mysql.c b/src/plugins/jobcomp/mysql/jobcomp_mysql.c index 2e91823e5567b9261c1530823bb9a8b828e528cc..40801884d73e3327bea0b73bbed3868dbdc9c912 100644 --- a/src/plugins/jobcomp/mysql/jobcomp_mysql.c +++ b/src/plugins/jobcomp/mysql/jobcomp_mysql.c @@ -358,7 +358,7 @@ extern int slurm_jobcomp_log_record(struct job_record *job_ptr) "'%s', \"%s\", %u, %u, %u", job_ptr->job_id, 
job_ptr->user_id, usr_str, job_ptr->group_id, grp_str, job_ptr->name, - job_state, job_ptr->total_procs, job_ptr->partition, lim_str, + job_state, job_ptr->total_cpus, job_ptr->partition, lim_str, (int)job_ptr->start_time, (int)job_ptr->end_time, job_ptr->node_cnt); diff --git a/src/plugins/jobcomp/pgsql/jobcomp_pgsql.c b/src/plugins/jobcomp/pgsql/jobcomp_pgsql.c index b7c8cedad580e24710fa419eda63bf281116db65..7b0edf915cf24700d0c1ef429a7acd4708d0ae3e 100644 --- a/src/plugins/jobcomp/pgsql/jobcomp_pgsql.c +++ b/src/plugins/jobcomp/pgsql/jobcomp_pgsql.c @@ -381,7 +381,7 @@ extern int slurm_jobcomp_log_record(struct job_record *job_ptr) "'%s', \"%s\", %u, %u, %u", job_ptr->job_id, job_ptr->user_id, usr_str, job_ptr->group_id, grp_str, job_ptr->name, - job_state, job_ptr->total_procs, job_ptr->partition, lim_str, + job_state, job_ptr->total_cpus, job_ptr->partition, lim_str, (int)job_ptr->start_time, (int)job_ptr->end_time, job_ptr->node_cnt); diff --git a/src/plugins/jobcomp/script/jobcomp_script.c b/src/plugins/jobcomp/script/jobcomp_script.c index dc1ad54820b5236cffe045d93e44c79287334e17..a30a4d4d49bb6250a64d01db70c24df80772485c 100644 --- a/src/plugins/jobcomp/script/jobcomp_script.c +++ b/src/plugins/jobcomp/script/jobcomp_script.c @@ -220,7 +220,7 @@ static struct jobcomp_info * _jobcomp_info_create (struct job_record *job) j->submit = job->details ? job->details->submit_time:job->start_time; j->batch_flag = job->batch_flag; j->nodes = xstrdup (job->nodes); - j->nprocs = job->total_procs; + j->nprocs = job->total_cpus; j->nnodes = job->node_cnt; j->account = job->account ? xstrdup (job->account) : NULL; if (job->details && job->details->work_dir) diff --git a/src/plugins/priority/multifactor/priority_multifactor.c b/src/plugins/priority/multifactor/priority_multifactor.c index 1e114f52018b39071f5733a68d8a4f40bb64864b..e4a6a9b2ee93bd90bf1365066664b726e367a9a0 100644 --- a/src/plugins/priority/multifactor/priority_multifactor.c +++ b/src/plugins/priority/multifactor/priority_multifactor.c @@ -438,7 +438,7 @@ static void _get_priority_factors(time_t start_time, struct job_record *job_ptr, * the job is requesting smaller than 1 node. * We need a way to figure out how to look at * cpus requested here for those situations. This can - * probably be done with the num_procs, but + * probably be done with the num_cpus, but * that isn't always used. This is usually * set on bluegene systems, which is where * this problem arose. 
The code below was @@ -452,9 +452,9 @@ static void _get_priority_factors(time_t start_time, struct job_record *job_ptr, factors->priority_js = (double)(node_record_count - job_ptr->details->min_nodes) / (double)node_record_count; -/* if(job_ptr->num_procs && job_ptr->num_procs != NO_VAL) { */ +/* if(job_ptr->num_cpus && job_ptr->num_cpus != NO_VAL) { */ /* factors->priority_js += */ -/* (double)(total_cpus - job_ptr->num_procs) */ +/* (double)(total_cpus - job_ptr->num_cpus) */ /* / (double)total_cpus; */ /* factors->priority_js /= 2; */ /* } */ @@ -462,9 +462,9 @@ static void _get_priority_factors(time_t start_time, struct job_record *job_ptr, factors->priority_js = (double)job_ptr->details->min_nodes / (double)node_record_count; -/* if(job_ptr->num_procs && job_ptr->num_procs != NO_VAL) { */ +/* if(job_ptr->num_cpus && job_ptr->num_cpus != NO_VAL) { */ /* factors->priority_js += */ -/* (double)job_ptr->num_procs */ +/* (double)job_ptr->num_cpus */ /* / (double)total_cpus; */ /* factors->priority_js /= 2; */ /* } */ @@ -764,7 +764,7 @@ static void *_decay_thread(void *no_data) * pow(decay_factor, (double)run_delta); real_decay = run_decay - * (double)job_ptr->total_procs; + * (double)job_ptr->total_cpus; /* now apply the usage factor for this qos */ @@ -941,11 +941,11 @@ int init ( void ) calc_fairshare = 0; weight_fs = 0; } else if(assoc_mgr_root_assoc) { - if(!cluster_procs) + if(!cluster_cpus) fatal("We need to have a cluster cpu count " "before we can init the priority/multifactor " "plugin"); - priority_p_set_max_cluster_usage(cluster_procs, + priority_p_set_max_cluster_usage(cluster_cpus, slurm_get_priority_decay_hl()); slurm_attr_init(&thread_attr); if (pthread_create(&decay_handler_thread, &thread_attr, diff --git a/src/plugins/sched/wiki/get_jobs.c b/src/plugins/sched/wiki/get_jobs.c index e9f785c0bd6408c348cc4723f049782e192396b3..de356a3756df2ece2cd93a98d3a024b83451c7b5 100644 --- a/src/plugins/sched/wiki/get_jobs.c +++ b/src/plugins/sched/wiki/get_jobs.c @@ -439,10 +439,10 @@ static uint32_t _get_job_tasks(struct job_record *job_ptr) uint32_t task_cnt; if (IS_JOB_STARTED(job_ptr)) { - task_cnt = job_ptr->total_procs; + task_cnt = job_ptr->total_cpus; } else { - if (job_ptr->num_procs) - task_cnt = job_ptr->num_procs; + if (job_ptr->details && job_ptr->details->min_cpus) + task_cnt = job_ptr->details->min_cpus; else task_cnt = 1; if (job_ptr->details) { diff --git a/src/plugins/sched/wiki/start_job.c b/src/plugins/sched/wiki/start_job.c index 38b6d9c8d7a7d407ac9be248a44f7399b66b4329..13dca87f6d4c08247f3b12f05365055d9f328445 100644 --- a/src/plugins/sched/wiki/start_job.c +++ b/src/plugins/sched/wiki/start_job.c @@ -248,8 +248,8 @@ static int _start_job(uint32_t jobid, int task_cnt, char *hostlist, job_ptr->details->req_nodes = new_node_list; save_req_bitmap = job_ptr->details->req_node_bitmap; job_ptr->details->req_node_bitmap = new_bitmap; - old_task_cnt = job_ptr->num_procs; - job_ptr->num_procs = MAX(task_cnt, old_task_cnt); + old_task_cnt = job_ptr->details->min_cpus; + job_ptr->details->min_cpus = MAX(task_cnt, old_task_cnt); job_ptr->priority = 100000000; fini: unlock_slurmctld(job_write_lock); @@ -290,7 +290,7 @@ static int _start_job(uint32_t jobid, int task_cnt, char *hostlist, /* restore some of job state */ job_ptr->priority = 0; - job_ptr->num_procs = old_task_cnt; + job_ptr->details->min_cpus = old_task_cnt; rc = -1; } diff --git a/src/plugins/sched/wiki2/get_jobs.c b/src/plugins/sched/wiki2/get_jobs.c index 
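Both wiki schedulers now bump the pending job's CPU floor when Moab directs a start, and must restore it if the start fails. Condensed from the hunks above (start_failed stands in for the real error path):

    /* Sketch of the _start_job() bracket around a forced start. */
    uint32_t old_task_cnt = job_ptr->details->min_cpus;

    job_ptr->details->min_cpus = MAX(task_cnt, old_task_cnt);
    job_ptr->priority = 100000000;      /* make it schedulable immediately */

    if (start_failed) {
        job_ptr->priority = 0;          /* hold the job again */
        job_ptr->details->min_cpus = old_task_cnt;
    }
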
84212807f9b2af09791f82ee584c6b7c4078e766..6b35afac3ee3e51e6cea0b66a47f07a002e30ca4 100644 --- a/src/plugins/sched/wiki2/get_jobs.c +++ b/src/plugins/sched/wiki2/get_jobs.c @@ -549,10 +549,10 @@ static uint32_t _get_job_tasks(struct job_record *job_ptr) uint32_t task_cnt; if (IS_JOB_STARTED(job_ptr)) { - task_cnt = job_ptr->total_procs; + task_cnt = job_ptr->total_cpus; } else { - if (job_ptr->num_procs) - task_cnt = job_ptr->num_procs; + if (job_ptr->details && job_ptr->details->min_cpus) + task_cnt = job_ptr->details->min_cpus; else task_cnt = 1; if (job_ptr->details) { diff --git a/src/plugins/sched/wiki2/job_modify.c b/src/plugins/sched/wiki2/job_modify.c index 44d6aabcfca6ad24412719b44eea7eb1140f4e8e..b236cbdfb0f57719c78cd465a318f2a3d1143435 100644 --- a/src/plugins/sched/wiki2/job_modify.c +++ b/src/plugins/sched/wiki2/job_modify.c @@ -336,7 +336,8 @@ host_fini: if (rc) { info("wiki: change job %u min_nodes to %u", jobid, new_node_cnt); #ifdef HAVE_BG - job_ptr->num_procs = job_desc.num_procs; + job_ptr->details->min_cpus = job_desc.min_cpus; + job_ptr->details->max_cpus = job_desc.max_cpus; job_ptr->details->job_min_cpus = job_desc.job_min_cpus; #endif last_job_update = now; diff --git a/src/plugins/sched/wiki2/job_will_run.c b/src/plugins/sched/wiki2/job_will_run.c index 62ad4fe6e39829df5fade6c489691b041ea8df83..29fc7934680434b22fe1ea16b6260456a79bcec8 100644 --- a/src/plugins/sched/wiki2/job_will_run.c +++ b/src/plugins/sched/wiki2/job_will_run.c @@ -270,7 +270,7 @@ static char * _will_run_test(uint32_t jobid, time_t start_time, &proc_cnt); #else - proc_cnt = job_ptr->total_procs; + proc_cnt = job_ptr->total_cpus; #endif snprintf(tmp_str, sizeof(tmp_str), "%u:%u@%u,", jobid, proc_cnt, (uint32_t) job_ptr->start_time); @@ -558,7 +558,7 @@ static char * _will_run_test2(uint32_t jobid, time_t start_time, select_g_select_jobinfo_get(job_ptr->select_jobinfo, SELECT_JOBDATA_NODE_CNT, &proc_cnt); #else - proc_cnt = job_ptr->total_procs; + proc_cnt = job_ptr->total_cpus; #endif snprintf(tmp_str, sizeof(tmp_str), "STARTINFO=%u TASKS=%u STARTTIME=%u NODES=", diff --git a/src/plugins/sched/wiki2/start_job.c b/src/plugins/sched/wiki2/start_job.c index 352eb3273560548616b867c756684cf8740bba71..3ac058165163f5dfad71be6ff1a79b972f111d17 100644 --- a/src/plugins/sched/wiki2/start_job.c +++ b/src/plugins/sched/wiki2/start_job.c @@ -302,8 +302,8 @@ static int _start_job(uint32_t jobid, int task_cnt, char *hostlist, job_ptr->details->req_nodes = new_node_list; save_req_bitmap = job_ptr->details->req_node_bitmap; job_ptr->details->req_node_bitmap = new_bitmap; - old_task_cnt = job_ptr->num_procs; - job_ptr->num_procs = MAX(task_cnt, old_task_cnt); + old_task_cnt = job_ptr->details->min_cpus; + job_ptr->details->min_cpus = MAX(task_cnt, old_task_cnt); job_ptr->priority = 100000000; fini: unlock_slurmctld(job_write_lock); @@ -345,7 +345,7 @@ static int _start_job(uint32_t jobid, int task_cnt, char *hostlist, /* restore some of job state */ job_ptr->priority = 0; - job_ptr->num_procs = old_task_cnt; + job_ptr->details->min_cpus = old_task_cnt; rc = -1; } diff --git a/src/plugins/select/bluegene/plugin/bg_job_place.c b/src/plugins/select/bluegene/plugin/bg_job_place.c index be35075bd3e2786deffa4008411e325bba68bdb5..963504712968edd33cac29d989f0adc2e7149336 100644 --- a/src/plugins/select/bluegene/plugin/bg_job_place.c +++ b/src/plugins/select/bluegene/plugin/bg_job_place.c @@ -833,7 +833,7 @@ static int _find_best_block_match(List block_list, bg_record_t *bg_record = NULL; uint16_t 
req_geometry[BA_SYSTEM_DIMENSIONS]; uint16_t conn_type, rotate, target_size = 0; - uint32_t req_procs = job_ptr->num_procs; + uint32_t req_procs = job_ptr->details->min_cpus; ba_request_t request; int i; int overlap_check = 0; @@ -1248,12 +1248,12 @@ static int _sync_block_lists(List full_list, List incomp_list) /* job_resrcs_ptr->node_bitmap = bit_copy(bitmap); */ /* if (job_resrcs_ptr->node_bitmap == NULL) */ /* fatal("bit_copy malloc failure"); */ -/* job_resrcs_ptr->nprocs = job_ptr->num_procs; */ +/* job_resrcs_ptr->nprocs = job_ptr->num_cpus; */ /* if (build_job_resources(job_resrcs_ptr, (void *)node_record_table_ptr, 1)) */ /* error("select_p_job_test: build_job_resources: %m"); */ -/* if (job_ptr->num_procs <= bg_conf->cpus_per_bp) */ -/* node_cpus = job_ptr->num_procs; */ +/* if (job_ptr->num_cpus <= bg_conf->cpus_per_bp) */ +/* node_cpus = job_ptr->num_cpus; */ /* else */ /* node_cpus = bg_conf->cpus_per_bp; */ @@ -1310,7 +1310,7 @@ static void _build_select_struct(struct job_record *job_ptr, job_resrcs_ptr->cpus_used = xmalloc(sizeof(uint16_t) * node_cnt); /* job_resrcs_ptr->nhosts = node_cnt; */ job_resrcs_ptr->nhosts = bit_set_count(bitmap); - job_resrcs_ptr->nprocs = job_ptr->num_procs; + job_resrcs_ptr->nprocs = job_ptr->details->min_cpus; job_resrcs_ptr->node_bitmap = bit_copy(bitmap); if (job_resrcs_ptr->node_bitmap == NULL) fatal("bit_copy malloc failure"); @@ -1470,7 +1470,7 @@ extern int submit_job(struct job_record *job_ptr, bitstr_t *slurm_block_bitmap, conn_type = SELECT_SMALL; else if(min_nodes > 1) conn_type = SELECT_TORUS; - else if(job_ptr->num_procs < bg_conf->cpus_per_bp) + else if(job_ptr->details->min_cpus < bg_conf->cpus_per_bp) conn_type = SELECT_SMALL; select_g_select_jobinfo_set(job_ptr->select_jobinfo, @@ -1490,7 +1490,7 @@ extern int submit_job(struct job_record *job_ptr, bitstr_t *slurm_block_bitmap, select_g_select_jobinfo_sprint(job_ptr->select_jobinfo, buf, sizeof(buf), SELECT_PRINT_MIXED); - debug("bluegene:submit_job: %u mode=%d %s nodes=%u-%u-%u", + debug("bluegene:submit_job: %u mode=%d %s nodes=%u-%u-%u", job_ptr->job_id, local_mode, buf, min_nodes, req_nodes, max_nodes); select_g_select_jobinfo_sprint(job_ptr->select_jobinfo, diff --git a/src/plugins/select/bluegene/plugin/bg_job_run.c b/src/plugins/select/bluegene/plugin/bg_job_run.c index d227e81bc20cfa82f9b2cf65478fb565e5b64dcf..6539e35e725d34c84f521b103501131a94555df5 100644 --- a/src/plugins/select/bluegene/plugin/bg_job_run.c +++ b/src/plugins/select/bluegene/plugin/bg_job_run.c @@ -501,8 +501,8 @@ static void _sync_agent(bg_update_t *bg_update_ptr) } last_bg_update = time(NULL); - bg_update_ptr->job_ptr->num_procs = bg_record->cpu_cnt; - bg_update_ptr->job_ptr->total_procs = bg_update_ptr->job_ptr->num_procs; + bg_update_ptr->job_ptr->total_cpus = + bg_update_ptr->job_ptr->details->min_cpus = bg_record->cpu_cnt; bg_record->job_running = bg_update_ptr->job_ptr->job_id; bg_record->job_ptr = bg_update_ptr->job_ptr; @@ -1252,8 +1252,7 @@ extern int start_job(struct job_record *job_ptr) } last_bg_update = time(NULL); - job_ptr->num_procs = bg_record->cpu_cnt; - job_ptr->total_procs = job_ptr->num_procs; + job_ptr->total_cpus = job_ptr->details->min_cpus = bg_record->cpu_cnt; bg_record->job_running = bg_update_ptr->job_ptr->job_id; bg_record->job_ptr = bg_update_ptr->job_ptr; if(!block_ptr_exist_in_list(bg_lists->job_running, bg_record)) { diff --git a/src/plugins/select/bluegene/plugin/select_bluegene.c b/src/plugins/select/bluegene/plugin/select_bluegene.c index 
f95527d141c84ae628c5d1ca0dcd899961884ad5..019dea70464432cdff8de3058df3beb58ae0daf3 100644 --- a/src/plugins/select/bluegene/plugin/select_bluegene.c +++ b/src/plugins/select/bluegene/plugin/select_bluegene.c @@ -1163,16 +1163,16 @@ extern int select_p_alter_node_cnt(enum select_node_cnt type, void *data) job_desc->max_nodes = job_desc->min_nodes; } - /* make sure if the user only specified num_procs to + /* make sure if the user only specified min_cpus to set min_nodes correctly */ - if((job_desc->num_procs != NO_VAL) - && (job_desc->num_procs > job_desc->min_nodes)) + if((job_desc->min_cpus != NO_VAL) + && (job_desc->min_cpus > job_desc->min_nodes)) job_desc->min_nodes = - job_desc->num_procs / bg_conf->cpu_ratio; + job_desc->min_cpus / bg_conf->cpu_ratio; - /* initialize num_procs to the min_nodes */ - job_desc->num_procs = job_desc->min_nodes * bg_conf->cpu_ratio; + /* initialize min_cpus to the min_nodes */ + job_desc->min_cpus = job_desc->min_nodes * bg_conf->cpu_ratio; if((job_desc->max_nodes == (uint32_t) NO_VAL) || (job_desc->max_nodes < job_desc->min_nodes)) @@ -1198,7 +1198,7 @@ extern int select_p_alter_node_cnt(enum select_node_cnt type, void *data) SELECT_JOBDATA_NODE_CNT, &job_desc->min_nodes); job_desc->min_nodes = tmp; - job_desc->num_procs = bg_conf->cpus_per_bp * tmp; + job_desc->min_cpus = bg_conf->cpus_per_bp * tmp; } else { #ifdef HAVE_BGL if(job_desc->min_nodes <= bg_conf->nodecard_node_cnt @@ -1219,7 +1219,7 @@ extern int select_p_alter_node_cnt(enum select_node_cnt type, void *data) tmp = bg_conf->bp_node_cnt/job_desc->min_nodes; - job_desc->num_procs = bg_conf->cpus_per_bp/tmp; + job_desc->min_cpus = bg_conf->cpus_per_bp/tmp; job_desc->min_nodes = 1; #else i = bg_conf->smallest_block; @@ -1235,12 +1235,12 @@ extern int select_p_alter_node_cnt(enum select_node_cnt type, void *data) SELECT_JOBDATA_NODE_CNT, &job_desc->min_nodes); - job_desc->num_procs = job_desc->min_nodes + job_desc->min_cpus = job_desc->min_nodes * bg_conf->cpu_ratio; job_desc->min_nodes = 1; #endif } - //job_desc->job_min_cpus = job_desc->num_procs; + //job_desc->job_min_cpus = job_desc->min_cpus; if(job_desc->max_nodes > bg_conf->bp_node_cnt) { tmp = job_desc->max_nodes % bg_conf->bp_node_cnt; diff --git a/src/plugins/select/cons_res/job_test.c b/src/plugins/select/cons_res/job_test.c index f6c396f0915115e3ac37e51e0e80abae70f8e9c3..7a47da58a1918edaeae16fef8bb980e56ddbc028 100644 --- a/src/plugins/select/cons_res/job_test.c +++ b/src/plugins/select/cons_res/job_test.c @@ -926,7 +926,7 @@ static int _eval_nodes(struct job_record *job_ptr, bitstr_t *node_map, consec_cpus[consec_index] = consec_nodes[consec_index] = 0; consec_req[consec_index] = -1; /* no required nodes here by default */ - rem_cpus = job_ptr->num_procs; + rem_cpus = job_ptr->details->min_cpus; rem_nodes = MAX(min_nodes, req_nodes); i = 0; @@ -1157,7 +1157,7 @@ static int _eval_nodes_topo(struct job_record *job_ptr, bitstr_t *bitmap, int best_fit_location = 0, best_fit_sufficient; bool sufficient; - rem_cpus = job_ptr->num_procs; + rem_cpus = job_ptr->details->min_cpus; if (req_nodes > min_nodes) rem_nodes = req_nodes; else @@ -1454,10 +1454,11 @@ static int _choose_nodes(struct job_record *job_ptr, bitstr_t *node_map, } } - /* NOTE: num_procs is 1 by default, + /* NOTE: details->min_cpus is 1 by default, * Only reset max_nodes if user explicitly sets a proc count */ - if ((job_ptr->num_procs > 1) && (max_nodes > job_ptr->num_procs)) - max_nodes = job_ptr->num_procs; + if ((job_ptr->details->min_cpus > 1) + && (max_nodes > 
job_ptr->details->min_cpus)) + max_nodes = job_ptr->details->min_cpus; origmap = bit_copy(node_map); if (origmap == NULL) @@ -1664,11 +1665,11 @@ extern int cr_job_test(struct job_record *job_ptr, bitstr_t *bitmap, } /* This is the case if -O/--overcommit is true */ - if (job_ptr->num_procs == job_ptr->details->min_nodes) { + if (job_ptr->details->min_cpus == job_ptr->details->min_nodes) { struct multi_core_data *mc_ptr = job_ptr->details->mc_ptr; - job_ptr->num_procs *= MAX(1, mc_ptr->min_threads); - job_ptr->num_procs *= MAX(1, mc_ptr->min_cores); - job_ptr->num_procs *= MAX(1, mc_ptr->min_sockets); + job_ptr->details->min_cpus *= MAX(1, mc_ptr->min_threads); + job_ptr->details->min_cpus *= MAX(1, mc_ptr->min_cores); + job_ptr->details->min_cpus *= MAX(1, mc_ptr->min_sockets); } debug3("cons_res: cr_job_test: evaluating job %u on %u nodes", @@ -1966,7 +1967,7 @@ alloc_job: if ((error_code == SLURM_SUCCESS) && (mode == SELECT_MODE_WILL_RUN)) { /* Set a reasonable value for the number of allocated CPUs. * Without computing task distribution this is only a guess */ - job_ptr->total_procs = MAX(job_ptr->num_procs, + job_ptr->total_cpus = MAX(job_ptr->details->min_cpus, job_ptr->details->min_nodes); } if ((error_code != SLURM_SUCCESS) || (mode != SELECT_MODE_RUN_NOW)) { @@ -1985,7 +1986,8 @@ alloc_job: job_res->nprocs = job_res->nhosts; if (job_ptr->details->ntasks_per_node) job_res->nprocs *= job_ptr->details->ntasks_per_node; - job_res->nprocs = MAX(job_res->nprocs, job_ptr->num_procs); + job_res->nprocs = MAX(job_res->nprocs, + job_ptr->details->min_cpus); job_res->node_req = job_node_req; job_res->cpus = cpu_count; job_res->cpus_used = xmalloc(job_res->nhosts * @@ -2062,9 +2064,9 @@ alloc_job: /* translate job_res->cpus array into format with rep count */ build_cnt = build_job_resources_cpu_array(job_res); if (build_cnt >= 0) - job_ptr->total_procs = build_cnt; + job_ptr->total_cpus = build_cnt; else - job_ptr->total_procs = total_cpus; /* best guess */ + job_ptr->total_cpus = total_cpus; /* best guess */ if ((cr_type != CR_CPU_MEMORY) && (cr_type != CR_CORE_MEMORY) && (cr_type != CR_SOCKET_MEMORY) && (cr_type != CR_MEMORY)) diff --git a/src/plugins/select/cons_res/select_cons_res.c b/src/plugins/select/cons_res/select_cons_res.c index ab528990f77fbad40ac65ff6cbca63795a7d9e7a..25c1d765754944d2bf74be4bdc772253391e06bd 100644 --- a/src/plugins/select/cons_res/select_cons_res.c +++ b/src/plugins/select/cons_res/select_cons_res.c @@ -1614,7 +1614,7 @@ extern int select_p_block_init(List part_list) * NOTE: the job information that is considered for scheduling includes: * req_node_bitmap: bitmap of specific nodes required by the job * contiguous: allocated nodes must be sequentially located - * num_procs: minimum number of processors required by the job + * num_cpus: minimum number of processors required by the job * NOTE: bitmap must be a superset of req_nodes at the time that * select_p_job_test is called */ diff --git a/src/plugins/select/linear/select_linear.c b/src/plugins/select/linear/select_linear.c index 7f3f13f305cc0b0ec47b1fd53a17ec77dbdacd73..eaddf4096829b5c6d15c5f56dcc906dbc94751a4 100644 --- a/src/plugins/select/linear/select_linear.c +++ b/src/plugins/select/linear/select_linear.c @@ -495,7 +495,7 @@ static void _build_select_struct(struct job_record *job_ptr, bitstr_t *bitmap) job_resrcs_ptr->node_bitmap = bit_copy(bitmap); if (job_resrcs_ptr->node_bitmap == NULL) fatal("bit_copy malloc failure"); - job_resrcs_ptr->nprocs = job_ptr->total_procs; + job_resrcs_ptr->nprocs = 
job_ptr->total_cpus; if (build_job_resources(job_resrcs_ptr, (void *)select_node_ptr, select_fast_schedule)) error("_build_select_struct: build_job_resources: %m"); @@ -635,7 +635,7 @@ static int _find_job_mate(struct job_record *job_ptr, bitstr_t *bitmap, while ((job_scan_ptr = (struct job_record *) list_next(job_iterator))) { if ((!IS_JOB_RUNNING(job_scan_ptr)) || (job_scan_ptr->node_cnt != req_nodes) || - (job_scan_ptr->total_procs < job_ptr->num_procs) || + (job_scan_ptr->total_cpus < job_ptr->details->min_cpus) || (!bit_super_set(job_scan_ptr->node_bitmap, bitmap))) continue; if (job_scan_ptr->details && job_ptr->details && @@ -654,7 +654,7 @@ static int _find_job_mate(struct job_record *job_ptr, bitstr_t *bitmap, continue; /* Excluded nodes in this job */ bit_and(bitmap, job_scan_ptr->node_bitmap); - job_ptr->total_procs = job_scan_ptr->total_procs; + job_ptr->total_cpus = job_scan_ptr->total_cpus; rc = SLURM_SUCCESS; break; } @@ -709,7 +709,7 @@ static int _job_test(struct job_record *job_ptr, bitstr_t *bitmap, /* Build table with information about sets of consecutive nodes */ consec_cpus[consec_index] = consec_nodes[consec_index] = 0; consec_req[consec_index] = -1; /* no required nodes here by default */ - rem_cpus = job_ptr->num_procs; + rem_cpus = job_ptr->details->min_cpus; if (req_nodes > min_nodes) rem_nodes = req_nodes; else @@ -910,8 +910,8 @@ static int _job_test(struct job_record *job_ptr, bitstr_t *bitmap, error_code = SLURM_SUCCESS; } if (error_code == SLURM_SUCCESS) { - /* job's total_procs is needed for SELECT_MODE_WILL_RUN */ - job_ptr->total_procs = total_cpus; + /* job's total_cpus is needed for SELECT_MODE_WILL_RUN */ + job_ptr->total_cpus = total_cpus; } xfree(consec_cpus); @@ -946,7 +946,7 @@ static int _job_test_topo(struct job_record *job_ptr, bitstr_t *bitmap, int best_fit_location = 0, best_fit_sufficient; bool sufficient; - rem_cpus = job_ptr->num_procs; + rem_cpus = job_ptr->details->min_cpus; if (req_nodes > min_nodes) rem_nodes = req_nodes; else @@ -1204,8 +1204,8 @@ static int _job_test_topo(struct job_record *job_ptr, bitstr_t *bitmap, rc = EINVAL; fini: if (rc == SLURM_SUCCESS) { - /* Job's total_procs is needed for SELECT_MODE_WILL_RUN */ - job_ptr->total_procs = total_cpus; + /* Job's total_cpus is needed for SELECT_MODE_WILL_RUN */ + job_ptr->total_cpus = total_cpus; } FREE_NULL_BITMAP(avail_nodes_bitmap); FREE_NULL_BITMAP(req_nodes_bitmap); @@ -2028,7 +2028,7 @@ extern int select_p_block_init(List part_list) * NOTE: the job information that is considered for scheduling includes: * req_node_bitmap: bitmap of specific nodes required by the job * contiguous: allocated nodes must be sequentially located - * num_procs: minimum number of processors required by the job + * num_cpus: minimum number of processors required by the job * NOTE: bitmap must be a superset of the job's required at the time that * select_p_job_test is called */ diff --git a/src/plugins/task/affinity/dist_tasks.c b/src/plugins/task/affinity/dist_tasks.c index 9c1afb38d3d4fa03a7b735ce10c6186bf3c9d7aa..5c70953a5d3534d95ad268dc25da161497b3ec5c 100644 --- a/src/plugins/task/affinity/dist_tasks.c +++ b/src/plugins/task/affinity/dist_tasks.c @@ -169,7 +169,7 @@ void batch_bind(batch_job_launch_msg_t *req) { bitstr_t *req_map, *hw_map; slurm_cred_arg_t arg; - uint16_t sockets=0, cores=0, num_procs; + uint16_t sockets=0, cores=0, num_cpus; int start, p, t, task_cnt=0; char *str; @@ -184,9 +184,9 @@ void batch_bind(batch_job_launch_msg_t *req) return; } - num_procs = MIN((sockets * cores), + 
num_cpus = MIN((sockets * cores), (conf->sockets * conf->cores)); - req_map = (bitstr_t *) bit_alloc(num_procs); + req_map = (bitstr_t *) bit_alloc(num_cpus); hw_map = (bitstr_t *) bit_alloc(conf->block_map_size); if (!req_map || !hw_map) { error("task/affinity: malloc error"); @@ -204,14 +204,14 @@ void batch_bind(batch_job_launch_msg_t *req) * sync with the slurmctld daemon). */ for (p = 0; p < (sockets * cores); p++) { if (bit_test(arg.core_bitmap, p)) - bit_set(req_map, (p % num_procs)); + bit_set(req_map, (p % num_cpus)); } str = (char *)bit_fmt_hexmask(req_map); debug3("task/affinity: job %u CPU mask from slurmctld: %s", req->job_id, str); xfree(str); - for (p = 0; p < num_procs; p++) { + for (p = 0; p < num_cpus; p++) { if (bit_test(req_map, p) == 0) continue; /* core_bitmap does not include threads, so we @@ -221,7 +221,7 @@ void batch_bind(batch_job_launch_msg_t *req) uint16_t pos = p * conf->threads + t; if (pos >= conf->block_map_size) { info("more resources configured than exist"); - p = num_procs; + p = num_cpus; break; } bit_set(hw_map, pos); @@ -546,7 +546,7 @@ static bitstr_t *_get_avail_map(launch_tasks_request_msg_t *req, { bitstr_t *req_map, *hw_map; slurm_cred_arg_t arg; - uint16_t p, t, num_procs, sockets, cores; + uint16_t p, t, num_cpus, sockets, cores; int job_node_id; int start; char *str; @@ -573,9 +573,9 @@ static bitstr_t *_get_avail_map(launch_tasks_request_msg_t *req, debug3("task/affinity: slurmctld s %u c %u; hw s %u c %u t %u", sockets, cores, *hw_sockets, *hw_cores, *hw_threads); - num_procs = MIN((sockets * cores), + num_cpus = MIN((sockets * cores), ((*hw_sockets)*(*hw_cores))); - req_map = (bitstr_t *) bit_alloc(num_procs); + req_map = (bitstr_t *) bit_alloc(num_cpus); hw_map = (bitstr_t *) bit_alloc(conf->block_map_size); if (!req_map || !hw_map) { error("task/affinity: malloc error"); @@ -590,7 +590,7 @@ static bitstr_t *_get_avail_map(launch_tasks_request_msg_t *req, * sync with the slurmctld daemon). 
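batch_bind() and _get_avail_map() receive core indices from slurmctld but bind to hardware threads, so each selected core p expands to positions p*threads through p*threads+threads-1. A worked sketch with hypothetical hardware sizes, using the bitstring API:

    #include "src/common/bitstring.h"

    /* Hypothetical layout: 8 cores, 2 threads per core -> 16 CPU positions.
     * Core 3 expands to CPU positions 6 and 7. */
    int cores = 8, threads = 2, block_map_size = 16, p, t;
    bitstr_t *req_map = bit_alloc(cores);            /* core-level grant    */
    bitstr_t *hw_map  = bit_alloc(block_map_size);   /* thread-level mask   */

    bit_set(req_map, 3);                             /* slurmctld gave core 3 */
    for (p = 0; p < cores; p++) {
        if (!bit_test(req_map, p))
            continue;
        for (t = 0; t < threads; t++)
            bit_set(hw_map, p * threads + t);
    }
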
*/ for (p = 0; p < (sockets * cores); p++) { if (bit_test(arg.core_bitmap, start+p)) - bit_set(req_map, (p % num_procs)); + bit_set(req_map, (p % num_cpus)); } str = (char *)bit_fmt_hexmask(req_map); @@ -598,7 +598,7 @@ static bitstr_t *_get_avail_map(launch_tasks_request_msg_t *req, req->job_id, req->job_step_id, str); xfree(str); - for (p = 0; p < num_procs; p++) { + for (p = 0; p < num_cpus; p++) { if (bit_test(req_map, p) == 0) continue; /* core_bitmap does not include threads, so we diff --git a/src/salloc/salloc.c b/src/salloc/salloc.c index b4b75ac9349ca452cefde6123d11dba36c6076ef..f586bf4da4b498d1b77265dc598943e844813467 100644 --- a/src/salloc/salloc.c +++ b/src/salloc/salloc.c @@ -535,10 +535,10 @@ static int _fill_job_desc_from_opts(job_desc_msg_t *desc) if (opt.tmpdisk > -1) desc->job_min_tmp_disk = opt.tmpdisk; if (opt.overcommit) { - desc->num_procs = opt.min_nodes; + desc->min_cpus = opt.min_nodes; desc->overcommit = opt.overcommit; } else - desc->num_procs = opt.nprocs * opt.cpus_per_task; + desc->min_cpus = opt.nprocs * opt.cpus_per_task; if (opt.nprocs_set) desc->num_tasks = opt.nprocs; if (opt.cpus_set) diff --git a/src/sbatch/sbatch.c b/src/sbatch/sbatch.c index 6560415096e8ca7d413674b39b2d1c259c0149bd..b1acd68cceb3ec5aa05fe4c5e01fda13c20f5d9c 100644 --- a/src/sbatch/sbatch.c +++ b/src/sbatch/sbatch.c @@ -269,10 +269,10 @@ static int fill_job_desc_from_opts(job_desc_msg_t *desc) if (opt.tmpdisk > -1) desc->job_min_tmp_disk = opt.tmpdisk; if (opt.overcommit) { - desc->num_procs = MAX(opt.min_nodes, 1); + desc->min_cpus = MAX(opt.min_nodes, 1); desc->overcommit = opt.overcommit; } else - desc->num_procs = opt.nprocs * opt.cpus_per_task; + desc->min_cpus = opt.nprocs * opt.cpus_per_task; if (opt.nprocs_set) desc->num_tasks = opt.nprocs; if (opt.cpus_set) diff --git a/src/scontrol/update_job.c b/src/scontrol/update_job.c index 976647edfe35a706e8ee004734e228a68d627dda..0cd7f702d0cd8493958fb9381da5433b0b7d43a5 100644 --- a/src/scontrol/update_job.c +++ b/src/scontrol/update_job.c @@ -335,7 +335,7 @@ scontrol_update_job (int argc, char *argv[]) /* ReqProcs was replaced by NumTasks in SLURM version 2.1 */ else if ((strncasecmp(tag, "ReqProcs", MAX(taglen, 4)) == 0) || (strncasecmp(tag, "NumTasks", MAX(taglen, 8)) == 0)) { - job_msg.num_procs = + job_msg.num_tasks = (uint32_t) strtol(val, (char **) NULL, 10); update_cnt++; } diff --git a/src/slurmctld/acct_policy.c b/src/slurmctld/acct_policy.c index 0fcaba9601a8578acad8df836ffa40c0bea07b43..64e6252097884d22c0833e8dfe04a62d0fc1d7c7 100644 --- a/src/slurmctld/acct_policy.c +++ b/src/slurmctld/acct_policy.c @@ -243,7 +243,7 @@ extern void acct_policy_job_begin(struct job_record *job_ptr) list_append(qos_ptr->user_limit_list, used_limits); } qos_ptr->grp_used_jobs++; - qos_ptr->grp_used_cpus += job_ptr->total_procs; + qos_ptr->grp_used_cpus += job_ptr->total_cpus; qos_ptr->grp_used_nodes += job_ptr->node_cnt; used_limits->jobs++; slurm_mutex_unlock(&assoc_mgr_qos_lock); @@ -253,7 +253,7 @@ extern void acct_policy_job_begin(struct job_record *job_ptr) assoc_ptr = (acct_association_rec_t *)job_ptr->assoc_ptr; while(assoc_ptr) { assoc_ptr->used_jobs++; - assoc_ptr->grp_used_cpus += job_ptr->total_procs; + assoc_ptr->grp_used_cpus += job_ptr->total_cpus; assoc_ptr->grp_used_nodes += job_ptr->node_cnt; /* now handle all the group limits of the parents */ assoc_ptr = assoc_ptr->parent_assoc_ptr; @@ -301,7 +301,7 @@ extern void acct_policy_job_fini(struct job_record *job_ptr) debug2("acct_policy_job_fini: used_jobs underflow " "for qos 
%s", qos_ptr->name); - qos_ptr->grp_used_cpus -= job_ptr->total_procs; + qos_ptr->grp_used_cpus -= job_ptr->total_cpus; if((int)qos_ptr->grp_used_cpus < 0) { qos_ptr->grp_used_cpus = 0; debug2("acct_policy_job_fini: grp_used_cpus underflow " @@ -334,7 +334,7 @@ extern void acct_policy_job_fini(struct job_record *job_ptr) debug2("acct_policy_job_fini: used_jobs underflow " "for account %s", assoc_ptr->acct); - assoc_ptr->grp_used_cpus -= job_ptr->total_procs; + assoc_ptr->grp_used_cpus -= job_ptr->total_cpus; if ((int)assoc_ptr->grp_used_cpus < 0) { assoc_ptr->grp_used_cpus = 0; debug2("acct_policy_job_fini: grp_used_cpus underflow " diff --git a/src/slurmctld/controller.c b/src/slurmctld/controller.c index a256de111e25173b4f7868e06ef6e5608364825b..87e8410024c4ddc17fd7a06b624da0b6634f994d 100644 --- a/src/slurmctld/controller.c +++ b/src/slurmctld/controller.c @@ -153,7 +153,7 @@ void *acct_db_conn = NULL; int accounting_enforce = 0; int association_based_accounting = 0; bool ping_nodes_now = false; -int cluster_procs = 0; +int cluster_cpus = 0; /* Local variables */ static int daemonize = DEFAULT_DAEMONIZE; @@ -1016,7 +1016,7 @@ static int _accounting_cluster_ready() int i; int rc = SLURM_ERROR; time_t event_time = time(NULL); - int procs = 0; + int cpus = 0; bitstr_t *total_node_bitmap = NULL; char *cluster_nodes = NULL; @@ -1026,19 +1026,19 @@ static int _accounting_cluster_ready() continue; #ifdef SLURM_NODE_ACCT_REGISTER if (slurmctld_conf.fast_schedule) - procs += node_ptr->config_ptr->cpus; + cpus += node_ptr->config_ptr->cpus; else - procs += node_ptr->cpus; + cpus += node_ptr->cpus; #else - procs += node_ptr->config_ptr->cpus; + cpus += node_ptr->config_ptr->cpus; #endif } - /* Since cluster_procs is used else where we need to keep a - local var here to avoid race conditions on cluster_procs + /* Since cluster_cpus is used else where we need to keep a + local var here to avoid race conditions on cluster_cpus not being correct. */ - cluster_procs = procs; + cluster_cpus = cpus; /* Now get the names of all the nodes on the cluster at this time and send it also. 
@@ -1048,10 +1048,10 @@ static int _accounting_cluster_ready() cluster_nodes = bitmap2node_name(total_node_bitmap); FREE_NULL_BITMAP(total_node_bitmap); - rc = clusteracct_storage_g_cluster_procs(acct_db_conn, - slurmctld_cluster_name, - cluster_nodes, - cluster_procs, event_time); + rc = clusteracct_storage_g_cluster_cpus(acct_db_conn, + slurmctld_cluster_name, + cluster_nodes, + cluster_cpus, event_time); xfree(cluster_nodes); if(rc == ACCOUNTING_FIRST_REG) { /* see if we are running directly to a database @@ -1062,9 +1062,9 @@ static int _accounting_cluster_ready() } /* just incase the numbers change we need to - update the proc count on the cluster inside + update the cpu count on the cluster inside the priority plugin */ - priority_g_set_max_cluster_usage(cluster_procs, + priority_g_set_max_cluster_usage(cluster_cpus, slurmctld_conf.priority_decay_hl); return rc; diff --git a/src/slurmctld/job_mgr.c b/src/slurmctld/job_mgr.c index a67d58201b41e4b864a22300836a755d385eadf7..602f274c3c428f96a41ebc69684c65f9fca5a7fe 100644 --- a/src/slurmctld/job_mgr.c +++ b/src/slurmctld/job_mgr.c @@ -701,8 +701,7 @@ static void _dump_job_state(struct job_record *dump_job_ptr, Buf buffer) pack32(dump_job_ptr->time_limit, buffer); pack32(dump_job_ptr->priority, buffer); pack32(dump_job_ptr->alloc_sid, buffer); - pack32(dump_job_ptr->num_procs, buffer); - pack32(dump_job_ptr->total_procs, buffer); + pack32(dump_job_ptr->total_cpus, buffer); pack32(dump_job_ptr->cpu_cnt, buffer); pack32(dump_job_ptr->exit_code, buffer); pack32(dump_job_ptr->db_index, buffer); @@ -789,8 +788,8 @@ static void _dump_job_state(struct job_record *dump_job_ptr, Buf buffer) static int _load_job_state(Buf buffer, uint16_t protocol_version) { uint32_t job_id, user_id, group_id, time_limit, priority, alloc_sid; - uint32_t exit_code, num_procs, assoc_id, db_index, name_len; - uint32_t next_step_id, total_procs, cpu_cnt, + uint32_t exit_code, assoc_id, db_index, name_len; + uint32_t next_step_id, total_cpus, cpu_cnt, resv_id, spank_job_env_size = 0; time_t start_time, end_time, suspend_time, pre_sus_time, tot_sus_time; time_t now = time(NULL); @@ -815,7 +814,7 @@ static int _load_job_state(Buf buffer, uint16_t protocol_version) acct_qos_rec_t qos_rec; bool job_finished = false; - if(protocol_version >= SLURM_2_1_PROTOCOL_VERSION) { + if(protocol_version >= SLURM_2_2_PROTOCOL_VERSION) { safe_unpack32(&assoc_id, buffer); safe_unpack32(&job_id, buffer); @@ -842,8 +841,7 @@ static int _load_job_state(Buf buffer, uint16_t protocol_version) safe_unpack32(&time_limit, buffer); safe_unpack32(&priority, buffer); safe_unpack32(&alloc_sid, buffer); - safe_unpack32(&num_procs, buffer); - safe_unpack32(&total_procs, buffer); + safe_unpack32(&total_cpus, buffer); safe_unpack32(&cpu_cnt, buffer); safe_unpack32(&exit_code, buffer); safe_unpack32(&db_index, buffer); @@ -932,6 +930,136 @@ static int _load_job_state(Buf buffer, uint16_t protocol_version) } safe_unpack16(&step_flag, buffer); + while (step_flag == STEP_FLAG) { + /* No need to put these into accounting if they + * haven't been since all information will be + * put in when the job is finished. 
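State files written by 2.1 still carry the old per-job CPU word, which the loader folds into details->min_cpus, while the 2.2 format stores only total_cpus. A schematic of just those fields (helper name hypothetical; the real function unpacks many more values):

    #include <slurm/slurm_errno.h>
    #include "src/common/pack.h"
    #include "src/common/slurm_protocol_common.h"

    /* Schematic only: read the CPU words from a saved job record. */
    static int _load_cpu_fields(Buf buffer, uint16_t protocol_version,
                                uint32_t *total_cpus, uint32_t *min_cpus)
    {
        *min_cpus = 1;                           /* default if absent */
        if (protocol_version >= SLURM_2_2_PROTOCOL_VERSION) {
            safe_unpack32(total_cpus, buffer);
        } else {
            safe_unpack32(min_cpus, buffer);     /* old num_procs slot */
            safe_unpack32(total_cpus, buffer);
        }
        return SLURM_SUCCESS;

    unpack_error:
        return SLURM_ERROR;
    }
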
+ */ + if ((error_code = load_step_state(job_ptr, buffer, + protocol_version))) + goto unpack_error; + safe_unpack16(&step_flag, buffer); + } + } else if(protocol_version >= SLURM_2_1_PROTOCOL_VERSION) { + uint32_t min_cpus; + + safe_unpack32(&assoc_id, buffer); + safe_unpack32(&job_id, buffer); + + /* validity test as possible */ + if (job_id == 0) { + verbose("Invalid job_id %u", job_id); + goto unpack_error; + } + + job_ptr = find_job_record(job_id); + if (job_ptr == NULL) { + job_ptr = create_job_record(&error_code); + if (error_code) { + error("Create job entry failed for job_id %u", + job_id); + goto unpack_error; + } + job_ptr->job_id = job_id; + _add_job_hash(job_ptr); + } + + safe_unpack32(&user_id, buffer); + safe_unpack32(&group_id, buffer); + safe_unpack32(&time_limit, buffer); + safe_unpack32(&priority, buffer); + safe_unpack32(&alloc_sid, buffer); + safe_unpack32(&min_cpus, buffer); + safe_unpack32(&total_cpus, buffer); + safe_unpack32(&cpu_cnt, buffer); + safe_unpack32(&exit_code, buffer); + safe_unpack32(&db_index, buffer); + safe_unpack32(&assoc_id, buffer); + safe_unpack32(&resv_id, buffer); + safe_unpack32(&next_step_id, buffer); + + safe_unpack_time(&start_time, buffer); + safe_unpack_time(&end_time, buffer); + safe_unpack_time(&suspend_time, buffer); + safe_unpack_time(&pre_sus_time, buffer); + safe_unpack_time(&tot_sus_time, buffer); + + safe_unpack16(&direct_set_prio, buffer); + safe_unpack16(&job_state, buffer); + safe_unpack16(&kill_on_node_fail, buffer); + safe_unpack16(&kill_on_step_done, buffer); + safe_unpack16(&batch_flag, buffer); + safe_unpack16(&mail_type, buffer); + safe_unpack16(&qos, buffer); + safe_unpack16(&state_reason, buffer); + safe_unpack16(&restart_cnt, buffer); + safe_unpack16(&resv_flags, buffer); + safe_unpack16(&warn_signal, buffer); + safe_unpack16(&warn_time, buffer); + + safe_unpackstr_xmalloc(&state_desc, &name_len, buffer); + safe_unpackstr_xmalloc(&resp_host, &name_len, buffer); + + safe_unpack16(&alloc_resp_port, buffer); + safe_unpack16(&other_port, buffer); + + if (job_state & JOB_COMPLETING) { + safe_unpackstr_xmalloc(&nodes_completing, + &name_len, buffer); + } + safe_unpackstr_xmalloc(&nodes, &name_len, buffer); + safe_unpackstr_xmalloc(&partition, &name_len, buffer); + if (partition == NULL) { + error("No partition for job %u", job_id); + goto unpack_error; + } + part_ptr = find_part_record (partition); + if (part_ptr == NULL) { + verbose("Invalid partition (%s) for job_id %u", + partition, job_id); + /* not a fatal error, partition could have been removed, + * reset_job_bitmaps() will clean-up this job */ + } + + + safe_unpackstr_xmalloc(&name, &name_len, buffer); + safe_unpackstr_xmalloc(&wckey, &name_len, buffer); + safe_unpackstr_xmalloc(&alloc_node, &name_len, buffer); + safe_unpackstr_xmalloc(&account, &name_len, buffer); + safe_unpackstr_xmalloc(&comment, &name_len, buffer); + safe_unpackstr_xmalloc(&network, &name_len, buffer); + safe_unpackstr_xmalloc(&licenses, &name_len, buffer); + safe_unpackstr_xmalloc(&mail_user, &name_len, buffer); + safe_unpackstr_xmalloc(&resv_name, &name_len, buffer); + + if (select_g_select_jobinfo_unpack(&select_jobinfo, buffer, + protocol_version)) + goto unpack_error; + if (unpack_job_resources(&job_resources, NULL, buffer, + protocol_version)) + goto unpack_error; + + safe_unpack16(&ckpt_interval, buffer); + if (checkpoint_alloc_jobinfo(&check_job) || + checkpoint_unpack_jobinfo(check_job, buffer)) + goto unpack_error; + + safe_unpackstr_array(&spank_job_env, &spank_job_env_size, + buffer); 
+ + safe_unpack16(&details, buffer); + if ((details == DETAILS_FLAG) && + (_load_job_details(job_ptr, buffer, protocol_version))) { + job_ptr->job_state = JOB_FAILED; + job_ptr->exit_code = 1; + job_ptr->state_reason = FAIL_SYSTEM; + xfree(job_ptr->state_desc); + job_ptr->end_time = now; + goto unpack_error; + } + safe_unpack16(&step_flag, buffer); + job_ptr->details->min_cpus = min_cpus; + while (step_flag == STEP_FLAG) { /* No need to put these into accounting if they * haven't been since all information will be @@ -1015,7 +1143,6 @@ static int _load_job_state(Buf buffer, uint16_t protocol_version) job_ptr->nodes_completing = nodes_completing; nodes_completing = NULL; /* reused, nothing left to free */ } - job_ptr->num_procs = num_procs; job_ptr->other_port = other_port; xfree(job_ptr->partition); job_ptr->partition = partition; @@ -1045,7 +1172,7 @@ static int _load_job_state(Buf buffer, uint16_t protocol_version) job_ptr->suspend_time = suspend_time; job_ptr->time_last_active = now; job_ptr->time_limit = time_limit; - job_ptr->total_procs = total_procs; + job_ptr->total_cpus = total_cpus; job_ptr->cpu_cnt = cpu_cnt; job_ptr->tot_sus_time = tot_sus_time; job_ptr->user_id = user_id; @@ -1159,6 +1286,8 @@ unpack_error: */ void _dump_job_details(struct job_details *detail_ptr, Buf buffer) { + pack32(detail_ptr->min_cpus, buffer); + pack32(detail_ptr->max_cpus, buffer); pack32(detail_ptr->min_nodes, buffer); pack32(detail_ptr->max_nodes, buffer); pack32(detail_ptr->num_tasks, buffer); @@ -1215,6 +1344,7 @@ static int _load_job_details(struct job_record *job_ptr, Buf buffer, char *ckpt_dir = NULL, *restart_dir = NULL; char **argv = (char **) NULL, **env_sup = (char **) NULL; uint32_t min_nodes, max_nodes; + uint32_t min_cpus = 1, max_cpus = NO_VAL; uint32_t job_min_cpus, job_min_memory, job_min_tmp_disk; uint32_t num_tasks, name_len, argc = 0, env_cnt = 0; uint16_t shared, contiguous, nice, ntasks_per_node; @@ -1226,7 +1356,55 @@ static int _load_job_details(struct job_record *job_ptr, Buf buffer, multi_core_data_t *mc_ptr; /* unpack the job's details from the buffer */ - if(protocol_version >= SLURM_2_1_PROTOCOL_VERSION) { + if(protocol_version >= SLURM_2_2_PROTOCOL_VERSION) { + safe_unpack32(&min_cpus, buffer); + safe_unpack32(&max_cpus, buffer); + safe_unpack32(&min_nodes, buffer); + safe_unpack32(&max_nodes, buffer); + safe_unpack32(&num_tasks, buffer); + + safe_unpack16(&acctg_freq, buffer); + safe_unpack16(&contiguous, buffer); + safe_unpack16(&cpus_per_task, buffer); + safe_unpack16(&nice, buffer); + safe_unpack16(&ntasks_per_node, buffer); + safe_unpack16(&requeue, buffer); + safe_unpack16(&shared, buffer); + safe_unpack16(&task_dist, buffer); + + safe_unpackstr_xmalloc(&cpu_bind, &name_len, buffer); + safe_unpack16(&cpu_bind_type, buffer); + safe_unpackstr_xmalloc(&mem_bind, &name_len, buffer); + safe_unpack16(&mem_bind_type, buffer); + safe_unpack16(&plane_size, buffer); + + safe_unpack8(&open_mode, buffer); + safe_unpack8(&overcommit, buffer); + safe_unpack8(&prolog_running, buffer); + + safe_unpack32(&job_min_cpus, buffer); + safe_unpack32(&job_min_memory, buffer); + safe_unpack32(&job_min_tmp_disk, buffer); + safe_unpack_time(&begin_time, buffer); + safe_unpack_time(&submit_time, buffer); + + safe_unpackstr_xmalloc(&req_nodes, &name_len, buffer); + safe_unpackstr_xmalloc(&exc_nodes, &name_len, buffer); + safe_unpackstr_xmalloc(&features, &name_len, buffer); + safe_unpackstr_xmalloc(&dependency, &name_len, buffer); + + safe_unpackstr_xmalloc(&err, &name_len, buffer); + 
safe_unpackstr_xmalloc(&in, &name_len, buffer); + safe_unpackstr_xmalloc(&out, &name_len, buffer); + safe_unpackstr_xmalloc(&work_dir, &name_len, buffer); + safe_unpackstr_xmalloc(&ckpt_dir, &name_len, buffer); + safe_unpackstr_xmalloc(&restart_dir, &name_len, buffer); + + if (unpack_multi_core_data(&mc_ptr, buffer)) + goto unpack_error; + safe_unpackstr_array(&argv, &argc, buffer); + safe_unpackstr_array(&env_sup, &env_cnt, buffer); + } else if(protocol_version >= SLURM_2_1_PROTOCOL_VERSION) { safe_unpack32(&min_nodes, buffer); safe_unpack32(&max_nodes, buffer); safe_unpack32(&num_tasks, buffer); @@ -1330,10 +1508,12 @@ static int _load_job_details(struct job_record *job_ptr, Buf buffer, job_ptr->details->job_min_cpus = job_min_cpus; job_ptr->details->job_min_memory = job_min_memory; job_ptr->details->job_min_tmp_disk = job_min_tmp_disk; + job_ptr->details->max_cpus = max_cpus; job_ptr->details->max_nodes = max_nodes; job_ptr->details->mc_ptr = mc_ptr; job_ptr->details->mem_bind = mem_bind; job_ptr->details->mem_bind_type = mem_bind_type; + job_ptr->details->min_cpus = min_cpus; job_ptr->details->min_nodes = min_nodes; job_ptr->details->nice = nice; job_ptr->details->ntasks_per_node = ntasks_per_node; @@ -1679,7 +1859,7 @@ extern void excise_node_from_job(struct job_record *job_ptr, void dump_job_desc(job_desc_msg_t * job_specs) { long job_id; - long job_min_cpus, job_min_memory, job_min_tmp_disk, num_procs; + long job_min_cpus, job_min_memory, job_min_tmp_disk, min_cpus; long time_limit, priority, contiguous, acctg_freq; long kill_on_node_fail, shared, immediate; long cpus_per_task, requeue, num_tasks, overcommit; @@ -1695,11 +1875,11 @@ void dump_job_desc(job_desc_msg_t * job_specs) job_specs->user_id, job_id, job_specs->partition, job_specs->name); - num_procs = (job_specs->num_procs != NO_VAL) ? - (long) job_specs->num_procs : -1L; + min_cpus = (job_specs->min_cpus != NO_VAL) ? + (long) job_specs->min_cpus : -1L; job_min_cpus = (job_specs->job_min_cpus != (uint16_t) NO_VAL) ? 
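The detail record gains the CPU range at the front of the packed stream, so the dump and load sides must stay in lock-step, and a 2.1 state file simply leaves the new fields at their defaults (min_cpus = 1, max_cpus = NO_VAL). An illustrative matched pair (names hypothetical):

    #include <slurm/slurm.h>
    #include <slurm/slurm_errno.h>
    #include "src/common/pack.h"

    /* Whatever order the dump side writes, the load side must read. */
    static void _dump_cpu_range(uint32_t min_cpus, uint32_t max_cpus, Buf buffer)
    {
        pack32(min_cpus, buffer);
        pack32(max_cpus, buffer);
    }

    static int _load_cpu_range(uint32_t *min_cpus, uint32_t *max_cpus, Buf buffer)
    {
        safe_unpack32(min_cpus, buffer);
        safe_unpack32(max_cpus, buffer);
        return SLURM_SUCCESS;

    unpack_error:
        return SLURM_ERROR;
    }
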
(long) job_specs->job_min_cpus : -1L; - debug3(" num_procs=%ld job_min_procs=%ld", num_procs, job_min_cpus); + debug3(" min_cpus=%ld job_min_procs=%ld", min_cpus, job_min_cpus); debug3(" -N min-[max]: %u-[%u]:%u:%u:%u", job_specs->min_nodes, job_specs->max_nodes, @@ -2582,13 +2762,13 @@ static int _job_create(job_desc_msg_t * job_desc, int allocate, int will_run, */ debug3("before alteration asking for nodes %u-%u cpus %u", job_desc->min_nodes, job_desc->max_nodes, - job_desc->num_procs); + job_desc->min_cpus); select_g_alter_node_cnt(SELECT_SET_NODE_CNT, job_desc); select_g_select_jobinfo_get(job_desc->select_jobinfo, SELECT_JOBDATA_MAX_CPUS, &max_cpus); debug3("after alteration asking for nodes %u-%u cpus %u-%u", job_desc->min_nodes, job_desc->max_nodes, - job_desc->num_procs, max_cpus); + job_desc->min_cpus, max_cpus); #endif /* check if select partition has sufficient resources to satisfy * the request */ @@ -2614,8 +2794,8 @@ static int _job_create(job_desc_msg_t * job_desc, int allocate, int will_run, i = bit_set_count(req_bitmap); if (i > job_desc->min_nodes) job_desc->min_nodes = i; - if (i > job_desc->num_procs) - job_desc->num_procs = i; + if (i > job_desc->min_cpus) + job_desc->min_cpus = i; if(job_desc->max_nodes && job_desc->min_nodes > job_desc->max_nodes) job_desc->max_nodes = job_desc->min_nodes; @@ -2690,14 +2870,14 @@ static int _job_create(job_desc_msg_t * job_desc, int allocate, int will_run, SELECT_JOBDATA_CONN_TYPE, &conn_type); } else if(((conn_type >= SELECT_SMALL) - && (job_desc->num_procs >= cpus_per_bp)) + && (job_desc->min_cpus >= cpus_per_bp)) || (((conn_type == SELECT_TORUS)|| (conn_type == SELECT_MESH)) - && (job_desc->num_procs < cpus_per_bp))) { + && (job_desc->min_cpus < cpus_per_bp))) { /* check to make sure we have a valid conn_type with * the cpu count */ info("Job's cpu count at %u makes our conn_type " "of '%s' invalid.", - job_desc->num_procs, conn_type_string(conn_type)); + job_desc->min_cpus, conn_type_string(conn_type)); error_code = ESLURM_INVALID_NODE_COUNT; goto cleanup_fail; } @@ -2706,9 +2886,9 @@ static int _job_create(job_desc_msg_t * job_desc, int allocate, int will_run, if (job_desc->max_nodes == NO_VAL) job_desc->max_nodes = 0; if ((part_ptr->state_up) - && (job_desc->num_procs > part_ptr->total_cpus)) { + && (job_desc->min_cpus > part_ptr->total_cpus)) { info("Job requested too many cpus (%u) of partition %s(%u)", - job_desc->num_procs, part_ptr->name, + job_desc->min_cpus, part_ptr->name, part_ptr->total_cpus); error_code = ESLURM_TOO_MANY_REQUESTED_CPUS; goto cleanup_fail; @@ -3454,8 +3634,7 @@ _copy_job_desc_to_job_record(job_desc_msg_t * job_desc, job_ptr->alloc_resp_port = job_desc->alloc_resp_port; job_ptr->other_port = job_desc->other_port; job_ptr->time_last_active = time(NULL); - job_ptr->num_procs = job_desc->num_procs; - job_ptr->cr_enabled = 0; + job_ptr->cr_enabled = 0; job_ptr->licenses = xstrdup(job_desc->licenses); job_ptr->mail_type = job_desc->mail_type; @@ -3478,7 +3657,9 @@ _copy_job_desc_to_job_record(job_desc_msg_t * job_desc, detail_ptr->acctg_freq = job_desc->acctg_freq; detail_ptr->nice = job_desc->nice; detail_ptr->open_mode = job_desc->open_mode; - detail_ptr->min_nodes = job_desc->min_nodes; + detail_ptr->min_cpus = job_desc->min_cpus; + detail_ptr->max_cpus = job_desc->max_cpus; + detail_ptr->min_nodes = job_desc->min_nodes; detail_ptr->max_nodes = job_desc->max_nodes; if (job_desc->req_nodes) { detail_ptr->req_nodes = @@ -3612,8 +3793,8 @@ static bool _valid_job_min_mem(job_desc_msg_t * job_desc_msg) 
cpus_per_node = node_record_table_ptr[0].config_ptr->cpus; else cpus_per_node = node_record_table_ptr[0].cpus; - if (job_desc_msg->num_procs != NO_VAL) - cpus_per_node = MIN(cpus_per_node, job_desc_msg->num_procs); + if (job_desc_msg->min_cpus != NO_VAL) + cpus_per_node = MIN(cpus_per_node, job_desc_msg->min_cpus); if (base_size & MEM_PER_CPU) { base_size &= (~MEM_PER_CPU); base_size *= cpus_per_node; @@ -3692,7 +3873,7 @@ void job_time_limit(void) job_cpu_usage_mins = (uint64_t) ((((now - job_ptr->start_time) - job_ptr->tot_sus_time) / 60) - * job_ptr->total_procs); + * job_ptr->total_cpus); /* Consider a job active if it has any active steps */ if (job_ptr->step_list && @@ -3947,10 +4128,10 @@ static void _job_timed_out(struct job_record *job_ptr) static int _validate_job_desc(job_desc_msg_t * job_desc_msg, int allocate, uid_t submit_uid) { - if ((job_desc_msg->num_procs == NO_VAL) + if ((job_desc_msg->min_cpus == NO_VAL) && (job_desc_msg->min_nodes == NO_VAL) && (job_desc_msg->req_nodes == NULL)) { - info("Job specified no num_procs, min_nodes or req_nodes"); + info("Job specified no min_cpus, min_nodes or req_nodes"); return ESLURM_JOB_MISSING_SIZE_SPECIFICATION; } if ((allocate == SLURM_CREATE_JOB_FLAG_NO_ALLOCATE_0) && @@ -4027,8 +4208,8 @@ static int _validate_job_desc(job_desc_msg_t * job_desc_msg, int allocate, job_desc_msg->min_threads = 1; /* default thread count of 1 */ if (job_desc_msg->min_nodes == NO_VAL) job_desc_msg->min_nodes = 1; /* default node count of 1 */ - if (job_desc_msg->num_procs == NO_VAL) - job_desc_msg->num_procs = job_desc_msg->min_nodes; + if (job_desc_msg->min_cpus == NO_VAL) + job_desc_msg->min_cpus = job_desc_msg->min_nodes; if (job_desc_msg->min_sockets == (uint16_t) NO_VAL) job_desc_msg->min_sockets = 1; /* default socket count of 1 */ if (job_desc_msg->min_cores == (uint16_t) NO_VAL) @@ -4284,7 +4465,107 @@ void pack_job(struct job_record *dump_job_ptr, uint16_t show_flags, Buf buffer, time_t begin_time = 0; char *nodelist = NULL; - if(protocol_version >= SLURM_2_1_PROTOCOL_VERSION) { + if(protocol_version >= SLURM_2_2_PROTOCOL_VERSION) { + pack32(dump_job_ptr->assoc_id, buffer); + pack32(dump_job_ptr->job_id, buffer); + pack32(dump_job_ptr->user_id, buffer); + pack32(dump_job_ptr->group_id, buffer); + + pack16(dump_job_ptr->job_state, buffer); + pack16(dump_job_ptr->batch_flag, buffer); + pack16(dump_job_ptr->state_reason, buffer); + pack16(dump_job_ptr->restart_cnt, buffer); + + pack32(dump_job_ptr->alloc_sid, buffer); + if ((dump_job_ptr->time_limit == NO_VAL) + && dump_job_ptr->part_ptr) + pack32(dump_job_ptr->part_ptr->max_time, buffer); + else + pack32(dump_job_ptr->time_limit, buffer); + + if (dump_job_ptr->details) { + pack16(dump_job_ptr->details->nice, buffer); + pack_time(dump_job_ptr->details->submit_time, buffer); + /* Earliest possible begin time */ + begin_time = dump_job_ptr->details->begin_time; + } else { + pack16(0, buffer); + pack_time((time_t) 0, buffer); + } + + pack_time(begin_time, buffer); + + /* Actual or expected start time */ + if(dump_job_ptr->start_time >= begin_time) + pack_time(dump_job_ptr->start_time, buffer); + else + pack_time(begin_time, buffer); + + pack_time(dump_job_ptr->end_time, buffer); + pack_time(dump_job_ptr->suspend_time, buffer); + pack_time(dump_job_ptr->pre_sus_time, buffer); + pack32(dump_job_ptr->priority, buffer); + + /* Only send the allocated nodelist since we are only sending + * the number of cpus and nodes that are currently allocated. 
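Time-limit and QOS usage accounting now charge against total_cpus; for example, a 32-CPU allocation that has run 120 minutes with no suspension has used 32 * 120 = 3840 CPU-minutes. A sketch of the calculation:

    #include <time.h>
    #include <inttypes.h>

    /* Sketch: CPU-minutes consumed so far by a running job. */
    static uint64_t _cpu_usage_mins(time_t now, time_t start_time,
                                    time_t tot_sus_time, uint32_t total_cpus)
    {
        return (uint64_t)(((now - start_time) - tot_sus_time) / 60) *
               total_cpus;
    }
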
*/ + if(!IS_JOB_COMPLETING(dump_job_ptr)) + packstr(dump_job_ptr->nodes, buffer); + else { + nodelist = bitmap2node_name(dump_job_ptr->node_bitmap); + packstr(nodelist, buffer); + xfree(nodelist); + } + + packstr(dump_job_ptr->partition, buffer); + packstr(dump_job_ptr->account, buffer); + packstr(dump_job_ptr->network, buffer); + packstr(dump_job_ptr->comment, buffer); + + slurm_mutex_lock(&assoc_mgr_qos_lock); + if (assoc_mgr_qos_list) + packstr(acct_qos_str(assoc_mgr_qos_list, + dump_job_ptr->qos), + buffer); + else + packnull(buffer); + slurm_mutex_unlock(&assoc_mgr_qos_lock); + + packstr(dump_job_ptr->licenses, buffer); + packstr(dump_job_ptr->state_desc, buffer); + packstr(dump_job_ptr->resv_name, buffer); + + pack32(dump_job_ptr->exit_code, buffer); + + if (show_flags & SHOW_DETAIL) { + pack_job_resources(dump_job_ptr->job_resrcs, buffer, + protocol_version); + } else { + uint32_t empty = NO_VAL; + pack32(empty, buffer); + } + + packstr(dump_job_ptr->name, buffer); + packstr(dump_job_ptr->wckey, buffer); + packstr(dump_job_ptr->alloc_node, buffer); + pack_bit_fmt(dump_job_ptr->node_bitmap, buffer); + + select_g_select_jobinfo_pack(dump_job_ptr->select_jobinfo, + buffer, protocol_version); + + detail_ptr = dump_job_ptr->details; + /* A few details are always dumped here */ + _pack_default_job_details(dump_job_ptr, buffer, + protocol_version); + + /* other job details are only dumped until the job starts + * running (at which time they become meaningless) */ + if (detail_ptr) + _pack_pending_job_details(detail_ptr, buffer, + protocol_version); + else + _pack_pending_job_details(NULL, buffer, + protocol_version); + } else if(protocol_version >= SLURM_2_1_PROTOCOL_VERSION) { pack32(dump_job_ptr->assoc_id, buffer); pack32(dump_job_ptr->job_id, buffer); pack32(dump_job_ptr->user_id, buffer); @@ -4368,17 +4649,19 @@ void pack_job(struct job_record *dump_job_ptr, uint16_t show_flags, Buf buffer, packstr(dump_job_ptr->alloc_node, buffer); pack_bit_fmt(dump_job_ptr->node_bitmap, buffer); + detail_ptr = dump_job_ptr->details; if (IS_JOB_COMPLETING(dump_job_ptr) && dump_job_ptr->cpu_cnt) pack32(dump_job_ptr->cpu_cnt, buffer); - else if (dump_job_ptr->total_procs) - pack32(dump_job_ptr->total_procs, buffer); + else if (dump_job_ptr->total_cpus) + pack32(dump_job_ptr->total_cpus, buffer); + else if(detail_ptr) + pack32(detail_ptr->min_cpus, buffer); else - pack32(dump_job_ptr->num_procs, buffer); + pack32(dump_job_ptr->cpu_cnt, buffer); select_g_select_jobinfo_pack(dump_job_ptr->select_jobinfo, buffer, protocol_version); - detail_ptr = dump_job_ptr->details; /* A few details are always dumped here */ _pack_default_job_details(dump_job_ptr, buffer, protocol_version); @@ -4402,7 +4685,58 @@ static void _pack_default_job_details(struct job_record *job_ptr, struct job_details *detail_ptr = job_ptr->details; char *cmd_line = NULL; - if(protocol_version >= SLURM_2_1_PROTOCOL_VERSION) { + if(protocol_version >= SLURM_2_2_PROTOCOL_VERSION) { + if (detail_ptr) { + packstr(detail_ptr->features, buffer); + packstr(detail_ptr->work_dir, buffer); + packstr(detail_ptr->dependency, buffer); + if (detail_ptr->argv) { + for (i=0; detail_ptr->argv[i]; i++) { + if (cmd_line) + xstrcat(cmd_line, " "); + xstrcat(cmd_line, detail_ptr->argv[i]); + } + packstr(cmd_line, buffer); + xfree(cmd_line); + } else + packnull(buffer); + + if (IS_JOB_COMPLETING(job_ptr) && job_ptr->cpu_cnt) { + pack32(job_ptr->cpu_cnt, buffer); + pack32((uint32_t) 0, buffer); + } else if (job_ptr->total_cpus) { + pack32(job_ptr->total_cpus, 
buffer); + pack32((uint32_t) 0, buffer); + } else { + pack32(detail_ptr->min_cpus, buffer); + pack32(detail_ptr->max_cpus, buffer); + } + + if (job_ptr->node_cnt) { + pack32(job_ptr->node_cnt, buffer); + pack32((uint32_t) 0, buffer); + } else { + pack32(detail_ptr->min_nodes, buffer); + pack32(detail_ptr->max_nodes, buffer); + } + pack16(detail_ptr->requeue, buffer); + } else { + packnull(buffer); + packnull(buffer); + packnull(buffer); + packnull(buffer); + + if (job_ptr->total_cpus) + pack32(job_ptr->total_cpus, buffer); + else + pack32(job_ptr->cpu_cnt, buffer); + pack32((uint32_t) 0, buffer); + + pack32(job_ptr->node_cnt, buffer); + pack32((uint32_t) 0, buffer); + pack16((uint16_t) 0, buffer); + } + } else if(protocol_version >= SLURM_2_1_PROTOCOL_VERSION) { if (detail_ptr) { packstr(detail_ptr->features, buffer); packstr(detail_ptr->work_dir, buffer); @@ -4895,6 +5229,7 @@ int update_job(job_desc_msg_t * job_specs, uid_t uid) int error_code = SLURM_SUCCESS; int super_user = 0; uint32_t save_min_nodes = NO_VAL, save_max_nodes = NO_VAL, max_cpus; + uint32_t save_min_cpus = NO_VAL, save_max_cpus = NO_VAL; struct job_record *job_ptr; struct job_details *detail_ptr; struct part_record *tmp_part_ptr; @@ -5056,34 +5391,64 @@ int update_job(job_desc_msg_t * job_specs, uid_t uid) */ debug3("update before alteration asking for nodes %u-%u cpus %u", job_specs->min_nodes, job_specs->max_nodes, - job_specs->num_procs); + job_specs->min_cpus); select_g_alter_node_cnt(SELECT_SET_NODE_CNT, job_specs); select_g_select_jobinfo_get(job_specs->select_jobinfo, SELECT_JOBDATA_MAX_CPUS, &max_cpus); debug3("update after alteration asking for nodes %u-%u cpus %u-%u", job_specs->min_nodes, job_specs->max_nodes, - job_specs->num_procs, max_cpus); + job_specs->min_cpus, max_cpus); - if (job_specs->num_procs != NO_VAL) { - if (!IS_JOB_PENDING(job_ptr)) + /* Reset min and max cpu counts as needed, insure consistency */ + if (job_specs->min_cpus != NO_VAL) { + if ((!IS_JOB_PENDING(job_ptr)) || (detail_ptr == NULL)) + error_code = ESLURM_DISABLED; + else if (job_specs->min_cpus < 1) + error_code = ESLURM_INVALID_CPU_COUNT; + else { + save_min_cpus = detail_ptr->min_cpus; + detail_ptr->min_cpus = job_specs->min_cpus; + } + } + if (job_specs->max_cpus != NO_VAL) { + if ((!IS_JOB_PENDING(job_ptr)) || (detail_ptr == NULL)) error_code = ESLURM_DISABLED; - else if (job_specs->num_procs < 1) - error_code = ESLURM_BAD_TASK_COUNT; else { + save_max_cpus = detail_ptr->max_cpus; + detail_ptr->max_cpus = job_specs->max_cpus; + } + } + if ((save_min_cpus || save_max_cpus) && detail_ptr->max_cpus && + (detail_ptr->max_cpus < detail_ptr->min_cpus)) { + error_code = ESLURM_INVALID_CPU_COUNT; + if (save_min_cpus) { + detail_ptr->min_cpus = save_min_cpus; + save_min_cpus = NO_VAL; + } + if (save_max_cpus) { + detail_ptr->max_cpus = save_max_cpus; + save_max_cpus = NO_VAL; + } + } + if (save_min_cpus != NO_VAL) { #ifdef HAVE_BG - uint32_t node_cnt = job_specs->num_procs; - if(cpus_per_node) - node_cnt /= cpus_per_node; - select_g_select_jobinfo_set(job_ptr->select_jobinfo, - SELECT_JOBDATA_NODE_CNT, - &node_cnt); + uint32_t node_cnt = detail_ptr->min_cpus; + if(cpus_per_node) + node_cnt /= cpus_per_node; + select_g_select_jobinfo_set(job_ptr->select_jobinfo, + SELECT_JOBDATA_NODE_CNT, + &node_cnt); #endif - job_ptr->num_procs = job_specs->num_procs; - info("update_job: setting num_procs to %u for " - "job_id %u", job_specs->num_procs, - job_specs->job_id); - update_accounting = true; - } + info("update_job: setting min_cpus from " + 
"%u to %u for job_id %u", + save_min_cpus, detail_ptr->min_cpus, job_specs->job_id); + update_accounting = true; + } + if (save_max_cpus != NO_VAL) { + info("update_job: setting max_cpus from " + "%u to %u for job_id %u", + save_max_cpus, detail_ptr->max_cpus, job_specs->job_id); + update_accounting = true; } if (job_specs->job_min_cpus != (uint16_t) NO_VAL) { @@ -5103,6 +5468,40 @@ int update_job(job_desc_msg_t * job_specs, uid_t uid) } } + if (job_specs->num_tasks != NO_VAL) { + if (!IS_JOB_PENDING(job_ptr)) + error_code = ESLURM_DISABLED; + else if (job_specs->num_tasks < 1) + error_code = ESLURM_BAD_TASK_COUNT; + else { +#ifdef HAVE_BG + uint32_t node_cnt = job_specs->num_tasks; + if(cpus_per_node) + node_cnt /= cpus_per_node; + select_g_select_jobinfo_set(job_ptr->select_jobinfo, + SELECT_JOBDATA_NODE_CNT, + &node_cnt); +#endif + detail_ptr->num_tasks = job_specs->num_tasks; + info("update_job: setting num_tasks to %u for " + "job_id %u", job_specs->num_tasks, + job_specs->job_id); + if(detail_ptr->cpus_per_task) { + uint32_t new_cpus = detail_ptr->num_tasks + / detail_ptr->cpus_per_task; + if((new_cpus < detail_ptr->min_cpus) + || (!detail_ptr->overcommit + && (new_cpus > detail_ptr->min_cpus))) { + detail_ptr->min_cpus = new_cpus; + info("update_job: setting " + "num_procs to %u for " + "job_id %u", detail_ptr->min_cpus, + job_specs->job_id); + } + } + } + } + /* Reset min and max node counts as needed, insure consistency */ if (job_specs->min_nodes != NO_VAL) { if ((!IS_JOB_PENDING(job_ptr)) || (detail_ptr == NULL)) @@ -5616,28 +6015,28 @@ int update_job(job_desc_msg_t * job_specs, uid_t uid) select_g_select_jobinfo_get(job_specs->select_jobinfo, SELECT_JOBDATA_CONN_TYPE, &conn_type); if (conn_type != (uint16_t) NO_VAL) { - if (!IS_JOB_PENDING(job_ptr)) + if ((!IS_JOB_PENDING(job_ptr)) || (detail_ptr == NULL)) error_code = ESLURM_DISABLED; else { if((conn_type >= SELECT_SMALL) - && (job_ptr->num_procs >= cpus_per_bp)) { + && (detail_ptr->min_cpus >= cpus_per_bp)) { info("update_job: could not change " "conn_type to '%s' because cpu " "count is %u for job %u making " "the conn_type invalid.", conn_type_string(conn_type), - job_ptr->num_procs, + detail_ptr->min_cpus, job_ptr->job_id); error_code = ESLURM_INVALID_NODE_COUNT; } else if(((conn_type == SELECT_TORUS) || (conn_type == SELECT_MESH)) - && (job_ptr->num_procs < cpus_per_bp)) { + && (detail_ptr->min_cpus < cpus_per_bp)) { info("update_job: could not change " "conn_type to '%s' because cpu " "count is %u for job %u making " "the conn_type invalid.", conn_type_string(conn_type), - job_ptr->num_procs, + detail_ptr->min_cpus, job_ptr->job_id); error_code = ESLURM_INVALID_NODE_COUNT; } else { @@ -5654,13 +6053,14 @@ int update_job(job_desc_msg_t * job_specs, uid_t uid) /* check to make sure we didn't mess up with the proc count */ select_g_select_jobinfo_get(job_ptr->select_jobinfo, SELECT_JOBDATA_CONN_TYPE, &conn_type); - if(((conn_type >= SELECT_SMALL) - && (job_ptr->num_procs >= cpus_per_bp)) - || (((conn_type == SELECT_TORUS)|| (conn_type == SELECT_MESH)) - && (job_ptr->num_procs < cpus_per_bp))) { + if(detail_ptr && + (((conn_type >= SELECT_SMALL) + && (detail_ptr->min_cpus >= cpus_per_bp)) + || (((conn_type == SELECT_TORUS)|| (conn_type == SELECT_MESH)) + && (detail_ptr->min_cpus < cpus_per_bp)))) { info("update_job: With cpu count at %u our conn_type " "of '%s' is invalid for job %u.", - job_ptr->num_procs, + detail_ptr->min_cpus, conn_type_string(conn_type), job_ptr->job_id); error_code = ESLURM_INVALID_NODE_COUNT; @@ 
-7892,7 +8292,8 @@ _copy_job_record_to_job_desc(struct job_record *job_ptr) job_desc->job_min_cpus = details->job_min_cpus; job_desc->job_min_memory = details->job_min_memory; job_desc->job_min_tmp_disk = details->job_min_tmp_disk; - job_desc->num_procs = job_ptr->num_procs; + job_desc->min_cpus = details->min_cpus; + job_desc->max_cpus = details->max_cpus; job_desc->min_nodes = details->min_nodes; job_desc->max_nodes = details->max_nodes; job_desc->min_sockets = mc_ptr->min_sockets; diff --git a/src/slurmctld/job_scheduler.c b/src/slurmctld/job_scheduler.c index 94f8427abb4eedd4635e4a101866772d6f313a30..49b5f3f74ccc286501fe63942450167be8f1e789 100644 --- a/src/slurmctld/job_scheduler.c +++ b/src/slurmctld/job_scheduler.c @@ -1037,7 +1037,7 @@ extern int job_start_data(job_desc_msg_t *job_desc_msg, &resp_data->proc_cnt); #else - resp_data->proc_cnt = job_ptr->total_procs; + resp_data->proc_cnt = job_ptr->total_cpus; #endif resp_data->start_time = MAX(job_ptr->start_time, start_res); job_ptr->start_time = 0; /* restore pending job start time */ diff --git a/src/slurmctld/node_scheduler.c b/src/slurmctld/node_scheduler.c index 8617a198c424f6498a85545701d9b2fa45199ceb..86395fdfe61150cef2d4f7a6b561fdf556ad3723 100644 --- a/src/slurmctld/node_scheduler.c +++ b/src/slurmctld/node_scheduler.c @@ -352,7 +352,7 @@ _get_req_features(struct node_set *node_set_ptr, int node_set_size, { uint32_t saved_min_nodes, saved_job_min_nodes; bitstr_t *saved_req_node_bitmap = NULL; - uint32_t saved_num_procs, saved_req_nodes; + uint32_t saved_min_cpus, saved_max_cpus, saved_req_nodes; int rc, tmp_node_set_size; struct node_set *tmp_node_set_ptr; int error_code = SLURM_SUCCESS, i; @@ -389,8 +389,10 @@ _get_req_features(struct node_set *node_set_ptr, int node_set_size, saved_req_node_bitmap = bit_copy(accumulate_bitmap); job_ptr->details->req_node_bitmap = NULL; } - saved_num_procs = job_ptr->num_procs; - job_ptr->num_procs = 1; + saved_min_cpus = job_ptr->details->min_cpus; + saved_max_cpus = job_ptr->details->max_cpus; + job_ptr->details->min_cpus = 1; + job_ptr->details->max_cpus = 1; tmp_node_set_ptr = xmalloc(sizeof(struct node_set) * node_set_size); /* Accumulate nodes with required feature counts. 
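
The update_job() hunks above follow a save/validate/restore pattern for the new CPU limits: the old min_cpus/max_cpus are remembered, the requested values are installed, and if a non-zero max_cpus ends up below min_cpus the change is rolled back and ESLURM_INVALID_CPU_COUNT is returned. Below is a minimal, self-contained sketch of that pattern; struct cpu_limits, update_cpu_limits() and its return codes are hypothetical stand-ins, not the actual slurmctld code.

	#include <stdio.h>
	#include <stdint.h>

	#define NO_VAL 0xfffffffe	/* "not set" sentinel, mirroring slurm.h */

	/* Hypothetical stand-in for the relevant job_details fields. */
	struct cpu_limits {
		uint32_t min_cpus;
		uint32_t max_cpus;	/* 0 means "no upper bound" */
	};

	/*
	 * Save/validate/restore: install the requested values, and if a
	 * non-zero max ends up below min, undo the change and fail (the
	 * real code reports ESLURM_INVALID_CPU_COUNT here).
	 */
	static int update_cpu_limits(struct cpu_limits *lim,
				     uint32_t req_min, uint32_t req_max)
	{
		uint32_t save_min = NO_VAL, save_max = NO_VAL;

		if (req_min != NO_VAL) {
			if (req_min < 1)
				return -1;
			save_min = lim->min_cpus;
			lim->min_cpus = req_min;
		}
		if (req_max != NO_VAL) {
			save_max = lim->max_cpus;
			lim->max_cpus = req_max;
		}
		if (lim->max_cpus && (lim->max_cpus < lim->min_cpus)) {
			if (save_min != NO_VAL)
				lim->min_cpus = save_min;	/* undo */
			if (save_max != NO_VAL)
				lim->max_cpus = save_max;	/* undo */
			return -1;
		}
		return 0;
	}

	int main(void)
	{
		struct cpu_limits lim = { 4, 8 };

		/* Raising min to 16 while max stays 8 is rejected and rolled back. */
		if (update_cpu_limits(&lim, 16, NO_VAL) < 0)
			printf("rejected; limits still %u-%u\n",
			       lim.min_cpus, lim.max_cpus);
		return 0;
	}

Running it prints "rejected; limits still 4-8": an inconsistent request leaves the stored limits untouched, which is the behavior the hunks above add to update_job().
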
@@ -437,7 +439,7 @@ _get_req_features(struct node_set *node_set_ptr, int node_set_size, min_nodes = feat_ptr->count; req_nodes = feat_ptr->count; job_ptr->details->min_nodes = feat_ptr->count; - job_ptr->num_procs = feat_ptr->count; + job_ptr->details->min_cpus = feat_ptr->count; if (*preemptee_job_list) { list_destroy(*preemptee_job_list); *preemptee_job_list = NULL; @@ -502,7 +504,7 @@ _get_req_features(struct node_set *node_set_ptr, int node_set_size, } else job_ptr->details->req_node_bitmap = accumulate_bitmap; node_cnt = bit_set_count(job_ptr->details->req_node_bitmap); - job_ptr->num_procs = MAX(saved_num_procs, node_cnt); + job_ptr->details->min_cpus = MAX(saved_min_cpus, node_cnt); min_nodes = MAX(saved_min_nodes, node_cnt); job_ptr->details->min_nodes = min_nodes; req_nodes = MAX(min_nodes, req_nodes); @@ -511,7 +513,8 @@ _get_req_features(struct node_set *node_set_ptr, int node_set_size, } else { min_nodes = saved_min_nodes; req_nodes = saved_req_nodes; - job_ptr->num_procs = saved_num_procs; + job_ptr->details->min_cpus = saved_min_cpus; + job_ptr->details->max_cpus = saved_max_cpus; job_ptr->details->min_nodes = saved_job_min_nodes; } #if 0 @@ -548,7 +551,7 @@ _get_req_features(struct node_set *node_set_ptr, int node_set_size, /* restore job's initial required node bitmap */ FREE_NULL_BITMAP(job_ptr->details->req_node_bitmap); job_ptr->details->req_node_bitmap = saved_req_node_bitmap; - job_ptr->num_procs = saved_num_procs; + job_ptr->details->min_cpus = saved_min_cpus; job_ptr->details->min_nodes = saved_job_min_nodes; /* Restore available node bitmap, ignoring reservations */ @@ -666,9 +669,9 @@ _pick_best_nodes(struct node_set *node_set_ptr, int node_set_size, FREE_NULL_BITMAP(partially_idle_node_bitmap); return error_code; } - debug3("Job %u shared %d CR type %d num_procs %d nbits %d", + debug3("Job %u shared %d CR type %d cpus %u-%u nbits %d", job_ptr->job_id, shared, cr_enabled, cr_type, - job_ptr->num_procs, + job_ptr->details->min_cpus, job_ptr->details->max_cpus, bit_set_count(partially_idle_node_bitmap)); } @@ -1129,8 +1132,8 @@ extern int select_nodes(struct job_record *job_ptr, bool test_only, &preemptee_job_list); } /* set up the cpu_cnt here so we can decrement it as nodes - free up. total_procs is set within _get_req_features */ - job_ptr->cpu_cnt = job_ptr->total_procs; + free up. total_cpus is set within _get_req_features */ + job_ptr->cpu_cnt = job_ptr->total_cpus; if (!test_only && preemptee_job_list && (error_code == SLURM_SUCCESS)) _preempt_jobs(preemptee_job_list, &error_code); diff --git a/src/slurmctld/proc_req.c b/src/slurmctld/proc_req.c index c778492ebe68bca52bd49571aa6cc549dab1d0cf..9ae29d59833963d8d078b1bffb0c552b28d8a750 100644 --- a/src/slurmctld/proc_req.c +++ b/src/slurmctld/proc_req.c @@ -3299,15 +3299,15 @@ int _launch_batch_step(job_desc_msg_t *job_desc_msg, uid_t uid, /* _max_nprocs() represents the total number of CPUs available * for this step (overcommit not supported yet). If job_desc_msg - * contains a reasonable num_procs request, use that value; + * contains a reasonable min_cpus request, use that value; * otherwise default to the allocation processor request. 
*/ - launch_msg_ptr->nprocs = job_ptr->total_procs; - if (job_desc_msg->num_procs > 0 && - job_desc_msg->num_procs < launch_msg_ptr->nprocs) - launch_msg_ptr->nprocs = job_desc_msg->num_procs; + launch_msg_ptr->nprocs = job_ptr->total_cpus; + if (job_desc_msg->min_cpus > 0 && + job_desc_msg->min_cpus < launch_msg_ptr->nprocs) + launch_msg_ptr->nprocs = job_desc_msg->min_cpus; if (launch_msg_ptr->nprocs < 0) - launch_msg_ptr->nprocs = job_ptr->num_procs; + launch_msg_ptr->nprocs = job_ptr->cpu_cnt; launch_msg_ptr->num_cpu_groups = job_ptr->job_resrcs->cpu_array_cnt; launch_msg_ptr->cpus_per_node = xmalloc(sizeof(uint16_t) * diff --git a/src/slurmctld/slurmctld.h b/src/slurmctld/slurmctld.h index 3c6a85ed2a520a71a2ea3fc51efce09f0a6fcf1b..35fb22b1931cbe6f308b0512e32ed236dfa543b4 100644 --- a/src/slurmctld/slurmctld.h +++ b/src/slurmctld/slurmctld.h @@ -160,7 +160,7 @@ extern char *slurmctld_cluster_name; /* name of cluster */ extern void *acct_db_conn; extern int accounting_enforce; extern int association_based_accounting; -extern int cluster_procs; +extern int cluster_cpus; /*****************************************************************************\ * NODE parameters and data structures, mostly in src/common/node_conf.h @@ -324,10 +324,12 @@ struct job_details { * CPU | MEM_PER_CPU */ uint32_t job_min_tmp_disk; /* minimum tempdisk per node, MB */ uint32_t magic; /* magic cookie for data integrity */ + uint32_t max_cpus; /* maximum number of cpus */ uint32_t max_nodes; /* maximum number of nodes */ multi_core_data_t *mc_ptr; /* multi-core specific data */ char *mem_bind; /* binding map for map/mask_cpu */ uint16_t mem_bind_type; /* see mem_bind_type_t */ + uint32_t min_cpus; /* minimum number of cpus */ uint32_t min_nodes; /* minimum number of nodes */ uint16_t nice; /* requested priority change, * NICE_OFFSET == no change */ @@ -427,7 +429,6 @@ struct job_record { char *nodes_completing; /* nodes still in completing state * for this job, used to insure * epilog is not re-run for job */ - uint32_t num_procs; /* count of required processors */ uint16_t other_port; /* port for client communications */ char *partition; /* name of the partition */ struct part_record *part_ptr; /* pointer to the partition record */ @@ -467,7 +468,7 @@ struct job_record { uint32_t time_limit; /* time_limit minutes or INFINITE, * NO_VAL implies partition max_time */ time_t tot_sus_time; /* total time in suspend state */ - uint32_t total_procs; /* number of allocated processors, + uint32_t total_cpus; /* number of allocated cpus, * for accounting */ uint32_t user_id; /* user the job runs as */ uint16_t warn_signal; /* signal to send before end_time */ diff --git a/src/slurmctld/step_mgr.c b/src/slurmctld/step_mgr.c index a97f579b825d0588ab225ac1bb5e475180401634..cc77338d90a198959a9193427acd8722b98b8862 100644 --- a/src/slurmctld/step_mgr.c +++ b/src/slurmctld/step_mgr.c @@ -1007,10 +1007,11 @@ extern void step_alloc_lps(struct step_record *step_ptr) /* "scontrol reconfig" of live system */ pick_step_cores = false; } else if ((step_ptr->exclusive == 0) || - (step_ptr->cpu_count == job_ptr->total_procs)) { + (step_ptr->cpu_count == job_ptr->total_cpus)) { /* Step uses all of job's cores * Just copy the bitmap to save time */ - step_ptr->core_bitmap_job = bit_copy(job_resrcs_ptr->core_bitmap); + step_ptr->core_bitmap_job = bit_copy( + job_resrcs_ptr->core_bitmap); pick_step_cores = false; } #endif @@ -1595,7 +1596,10 @@ static void _pack_ctld_job_step_info(struct step_record *step_ptr, Buf buffer) task_cnt = 
step_ptr->step_layout->task_cnt; node_list = step_ptr->step_layout->node_list; } else { - task_cnt = step_ptr->job_ptr->num_procs; + if(step_ptr->job_ptr->details) + task_cnt = step_ptr->job_ptr->details->min_cpus; + else + task_cnt = step_ptr->job_ptr->cpu_cnt; node_list = step_ptr->job_ptr->nodes; } pack32(step_ptr->job_ptr->job_id, buffer); @@ -1603,10 +1607,12 @@ static void _pack_ctld_job_step_info(struct step_record *step_ptr, Buf buffer) pack16(step_ptr->ckpt_interval, buffer); pack32(step_ptr->job_ptr->user_id, buffer); #ifdef HAVE_BG - if (step_ptr->job_ptr->total_procs) - pack32(step_ptr->job_ptr->total_procs, buffer); + if (step_ptr->job_ptr->total_cpus) + pack32(step_ptr->job_ptr->total_cpus, buffer); + else if(step_ptr->job_ptr->details) + pack32(step_ptr->job_ptr->details->min_cpus, buffer); else - pack32(step_ptr->job_ptr->num_procs, buffer); + pack32(step_ptr->job_ptr->cpu_cnt, buffer); #else pack32(step_ptr->cpu_count, buffer); #endif diff --git a/src/slurmdbd/proc_req.c b/src/slurmdbd/proc_req.c index 3c64e3d6a853f637bc8b9d521f04bd44048c735f..65b03e32065d23254981a0b49ac89bfbb64d4a34 100644 --- a/src/slurmdbd/proc_req.c +++ b/src/slurmdbd/proc_req.c @@ -70,7 +70,7 @@ static int _archive_dump(slurmdbd_conn_t *slurmdbd_conn, Buf in_buffer, Buf *out_buffer, uint32_t *uid); static int _archive_load(slurmdbd_conn_t *slurmdbd_conn, Buf in_buffer, Buf *out_buffer, uint32_t *uid); -static int _cluster_procs(slurmdbd_conn_t *slurmdbd_conn, +static int _cluster_cpus(slurmdbd_conn_t *slurmdbd_conn, Buf in_buffer, Buf *out_buffer, uint32_t *uid); static int _get_accounts(slurmdbd_conn_t *slurmdbd_conn, Buf in_buffer, Buf *out_buffer, uint32_t *uid); @@ -223,8 +223,8 @@ proc_req(slurmdbd_conn_t *slurmdbd_conn, rc = _archive_load(slurmdbd_conn, in_buffer, out_buffer, uid); break; - case DBD_CLUSTER_PROCS: - rc = _cluster_procs(slurmdbd_conn, + case DBD_CLUSTER_CPUS: + rc = _cluster_cpus(slurmdbd_conn, in_buffer, out_buffer, uid); break; case DBD_GET_ACCOUNTS: @@ -950,42 +950,42 @@ end_it: return rc; } -static int _cluster_procs(slurmdbd_conn_t *slurmdbd_conn, +static int _cluster_cpus(slurmdbd_conn_t *slurmdbd_conn, Buf in_buffer, Buf *out_buffer, uint32_t *uid) { - dbd_cluster_procs_msg_t *cluster_procs_msg = NULL; + dbd_cluster_cpus_msg_t *cluster_cpus_msg = NULL; int rc = SLURM_SUCCESS; char *comment = NULL; if ((*uid != slurmdbd_conf->slurm_user_id && *uid != 0)) { - comment = "DBD_CLUSTER_PROCS message from invalid uid"; - error("DBD_CLUSTER_PROCS message from invalid uid %u", *uid); + comment = "DBD_CLUSTER_CPUS message from invalid uid"; + error("DBD_CLUSTER_CPUS message from invalid uid %u", *uid); rc = ESLURM_ACCESS_DENIED; goto end_it; } - if (slurmdbd_unpack_cluster_procs_msg(slurmdbd_conn->rpc_version, - &cluster_procs_msg, in_buffer) != + if (slurmdbd_unpack_cluster_cpus_msg(slurmdbd_conn->rpc_version, + &cluster_cpus_msg, in_buffer) != SLURM_SUCCESS) { - comment = "Failed to unpack DBD_CLUSTER_PROCS message"; + comment = "Failed to unpack DBD_CLUSTER_CPUS message"; error("%s", comment); rc = SLURM_ERROR; goto end_it; } - debug2("DBD_CLUSTER_PROCS: called for %s(%u)", - cluster_procs_msg->cluster_name, - cluster_procs_msg->proc_count); + debug2("DBD_CLUSTER_CPUS: called for %s(%u)", + cluster_cpus_msg->cluster_name, + cluster_cpus_msg->cpu_count); - rc = clusteracct_storage_g_cluster_procs( + rc = clusteracct_storage_g_cluster_cpus( slurmdbd_conn->db_conn, - cluster_procs_msg->cluster_name, - cluster_procs_msg->cluster_nodes, - cluster_procs_msg->proc_count, - 
cluster_procs_msg->event_time); + cluster_cpus_msg->cluster_name, + cluster_cpus_msg->cluster_nodes, + cluster_cpus_msg->cpu_count, + cluster_cpus_msg->event_time); end_it: - slurmdbd_free_cluster_procs_msg(slurmdbd_conn->rpc_version, - cluster_procs_msg); + slurmdbd_free_cluster_cpus_msg(slurmdbd_conn->rpc_version, + cluster_cpus_msg); *out_buffer = make_dbd_rc_msg(slurmdbd_conn->rpc_version, - rc, comment, DBD_CLUSTER_PROCS); + rc, comment, DBD_CLUSTER_CPUS); return rc; } @@ -1642,7 +1642,7 @@ static int _get_reservations(slurmdbd_conn_t *slurmdbd_conn, static int _flush_jobs(slurmdbd_conn_t *slurmdbd_conn, Buf in_buffer, Buf *out_buffer, uint32_t *uid) { - dbd_cluster_procs_msg_t *cluster_procs_msg = NULL; + dbd_cluster_cpus_msg_t *cluster_cpus_msg = NULL; int rc = SLURM_SUCCESS; char *comment = NULL; @@ -1652,8 +1652,8 @@ static int _flush_jobs(slurmdbd_conn_t *slurmdbd_conn, rc = ESLURM_ACCESS_DENIED; goto end_it; } - if (slurmdbd_unpack_cluster_procs_msg(slurmdbd_conn->rpc_version, - &cluster_procs_msg, in_buffer) != + if (slurmdbd_unpack_cluster_cpus_msg(slurmdbd_conn->rpc_version, + &cluster_cpus_msg, in_buffer) != SLURM_SUCCESS) { comment = "Failed to unpack DBD_FLUSH_JOBS message"; error("%s", comment); @@ -1661,15 +1661,15 @@ static int _flush_jobs(slurmdbd_conn_t *slurmdbd_conn, goto end_it; } debug2("DBD_FLUSH_JOBS: called for %s", - cluster_procs_msg->cluster_name); + cluster_cpus_msg->cluster_name); rc = acct_storage_g_flush_jobs_on_cluster( slurmdbd_conn->db_conn, - cluster_procs_msg->cluster_name, - cluster_procs_msg->event_time); + cluster_cpus_msg->cluster_name, + cluster_cpus_msg->event_time); end_it: - slurmdbd_free_cluster_procs_msg(slurmdbd_conn->rpc_version, - cluster_procs_msg); + slurmdbd_free_cluster_cpus_msg(slurmdbd_conn->rpc_version, + cluster_cpus_msg); *out_buffer = make_dbd_rc_msg(slurmdbd_conn->rpc_version, rc, comment, DBD_FLUSH_JOBS); return rc; @@ -1839,7 +1839,7 @@ static int _job_start(slurmdbd_conn_t *slurmdbd_conn, memset(&details, 0, sizeof(struct job_details)); memset(&id_rc_msg, 0, sizeof(dbd_id_rc_msg_t)); - job.total_procs = job_start_msg->alloc_cpus; + job.total_cpus = job_start_msg->alloc_cpus; job.node_cnt = job_start_msg->alloc_nodes; job.account = _replace_double_quotes(job_start_msg->account); job.assoc_id = job_start_msg->assoc_id; @@ -1854,7 +1854,7 @@ static int _job_start(slurmdbd_conn_t *slurmdbd_conn, job.nodes = job_start_msg->nodes; job.network = job_start_msg->node_inx; job.partition = job_start_msg->partition; - job.num_procs = job_start_msg->req_cpus; + details.min_cpus = job_start_msg->req_cpus; job.resv_id = job_start_msg->resv_id; job.priority = job_start_msg->priority; job.start_time = job_start_msg->start_time; @@ -3234,7 +3234,7 @@ static int _step_complete(slurmdbd_conn_t *slurmdbd_conn, job.start_time = step_comp_msg->start_time; details.submit_time = step_comp_msg->job_submit_time; step.step_id = step_comp_msg->step_id; - step.cpu_count = step_comp_msg->total_procs; + step.cpu_count = step_comp_msg->total_cpus; details.num_tasks = step_comp_msg->total_tasks; job.details = &details; @@ -3298,7 +3298,7 @@ static int _step_start(slurmdbd_conn_t *slurmdbd_conn, step.start_time = step_start_msg->start_time; details.submit_time = step_start_msg->job_submit_time; step.step_id = step_start_msg->step_id; - step.cpu_count = step_start_msg->total_procs; + step.cpu_count = step_start_msg->total_cpus; details.num_tasks = step_start_msg->total_tasks; layout.node_cnt = step_start_msg->node_cnt; diff --git a/src/smap/job_functions.c 
b/src/smap/job_functions.c index d502247eb9f5fcc75e1f793d5177521a1048b7e3..b69354c8f2214f899dcc1ddf1118707fdc714e48 100644 --- a/src/smap/job_functions.c +++ b/src/smap/job_functions.c @@ -44,7 +44,7 @@ #include "src/smap/smap.h" static int _get_node_cnt(job_info_t * job); -static int _max_procs_per_node(void); +static int _max_cpus_per_node(void); static int _nodes_in_list(char *node_list); static void _print_header_job(void); static int _print_text_job(job_info_t * job_ptr); @@ -136,7 +136,7 @@ extern void get_job(void) if((count>=text_line_cnt) && (printed_jobs < (text_win->_maxy-3))) { - job_ptr->num_procs = + job_ptr->num_cpus = (int)letters[count%62]; wattron(text_win, COLOR_PAIR(colors[count%6])); @@ -146,7 +146,7 @@ extern void get_job(void) printed_jobs++; } } else { - job_ptr->num_procs = (int)letters[count%62]; + job_ptr->num_cpus = (int)letters[count%62]; _print_text_job(job_ptr); } count++; @@ -167,7 +167,7 @@ extern void get_job(void) < (text_win->_maxy-3))) { xfree(job_ptr->nodes); job_ptr->nodes = xstrdup("waiting..."); - job_ptr->num_procs = (int) letters[count%62]; + job_ptr->num_cpus = (int) letters[count%62]; wattron(text_win, COLOR_PAIR(colors[count%6])); _print_text_job(job_ptr); @@ -178,7 +178,7 @@ extern void get_job(void) } else { xfree(job_ptr->nodes); job_ptr->nodes = xstrdup("waiting..."); - job_ptr->num_procs = (int) letters[count%62]; + job_ptr->num_cpus = (int) letters[count%62]; _print_text_job(job_ptr); printed_jobs++; } @@ -298,7 +298,7 @@ static int _print_text_job(job_info_t * job_ptr) #endif if(!params.commandline) { mvwprintw(text_win, main_ycord, - main_xcord, "%c", job_ptr->num_procs); + main_xcord, "%c", job_ptr->num_cpus); main_xcord += 3; mvwprintw(text_win, main_ycord, main_xcord, "%d", job_ptr->job_id); @@ -309,10 +309,11 @@ static int _print_text_job(job_info_t * job_ptr) #ifdef HAVE_BG mvwprintw(text_win, main_ycord, main_xcord, "%.16s", - select_g_select_jobinfo_sprint(job_ptr->select_jobinfo, - time_buf, - sizeof(time_buf), - SELECT_PRINT_BG_ID)); + select_g_select_jobinfo_sprint( + job_ptr->select_jobinfo, + time_buf, + sizeof(time_buf), + SELECT_PRINT_BG_ID)); main_xcord += 18; #endif #ifdef HAVE_CRAY_XT @@ -440,16 +441,16 @@ static int _get_node_cnt(job_info_t * job) int node_cnt = 0, round; bool completing = job->job_state & JOB_COMPLETING; uint16_t base_job_state = job->job_state & (~JOB_COMPLETING); - static int max_procs = 0; + static int max_cpus = 0; if (base_job_state == JOB_PENDING || completing) { - if (max_procs == 0) - max_procs = _max_procs_per_node(); + if (max_cpus == 0) + max_cpus = _max_cpus_per_node(); node_cnt = _nodes_in_list(job->req_nodes); node_cnt = MAX(node_cnt, job->num_nodes); - round = job->num_procs + max_procs - 1; - round /= max_procs; /* round up */ + round = job->num_cpus + max_cpus - 1; + round /= max_cpus; /* round up */ node_cnt = MAX(node_cnt, round); } else node_cnt = _nodes_in_list(job->nodes); @@ -465,9 +466,9 @@ static int _nodes_in_list(char *node_list) } /* Return the maximum number of processors for any node in the cluster */ -static int _max_procs_per_node(void) +static int _max_cpus_per_node(void) { - int error_code, max_procs = 1; + int error_code, max_cpus = 1; node_info_msg_t *node_info_ptr = NULL; error_code = slurm_load_node ((time_t) NULL, &node_info_ptr, @@ -476,11 +477,11 @@ static int _max_procs_per_node(void) int i; node_info_t *node_ptr = node_info_ptr->node_array; for (i=0; i<node_info_ptr->record_count; i++) { - max_procs = MAX(max_procs, node_ptr[i].cpus); + max_cpus = 
MAX(max_cpus, node_ptr[i].cpus); } slurm_free_node_info_msg (node_info_ptr); } - return max_procs; + return max_cpus; } diff --git a/src/squeue/opts.c b/src/squeue/opts.c index 653d84e09dc17dea464aa5efd59b525234a78244..f611aa798ef6d846f1f6a6b7f44acdfca88c9ed8 100644 --- a/src/squeue/opts.c +++ b/src/squeue/opts.c @@ -80,7 +80,7 @@ static List _build_step_list( char* str ); static List _build_user_list( char* str ); static char *_get_prefix(char *token); static void _help( void ); -static int _max_procs_per_node(void); +static int _max_cpus_per_node(void); static int _parse_state( char* str, enum job_states* states ); static void _parse_token( char *token, char *field, int *field_size, bool *right_justify, char **suffix); @@ -341,16 +341,16 @@ parse_command_line( int argc, char* argv[] ) } } - params.max_procs = _max_procs_per_node(); + params.max_cpus = _max_cpus_per_node(); if ( params.verbose ) _print_options(); } /* Return the maximum number of processors for any node in the cluster */ -static int _max_procs_per_node(void) +static int _max_cpus_per_node(void) { - int error_code, max_procs = 1; + int error_code, max_cpus = 1; node_info_msg_t *node_info_ptr = NULL; error_code = slurm_load_node ((time_t) NULL, &node_info_ptr, @@ -359,12 +359,12 @@ static int _max_procs_per_node(void) int i; node_info_t *node_ptr = node_info_ptr->node_array; for (i=0; i<node_info_ptr->record_count; i++) { - max_procs = MAX(max_procs, node_ptr[i].cpus); + max_cpus = MAX(max_cpus, node_ptr[i].cpus); } slurm_free_node_info_msg (node_info_ptr); } - return max_procs; + return max_cpus; } /* @@ -515,15 +515,15 @@ extern int parse_format( char* format ) right_justify, suffix ); else if (field[0] == 'c') - job_format_add_min_procs( params.format_list, - field_size, - right_justify, - suffix ); + job_format_add_min_cpus( params.format_list, + field_size, + right_justify, + suffix ); else if (field[0] == 'C') - job_format_add_num_procs( params.format_list, - field_size, - right_justify, - suffix ); + job_format_add_num_cpus( params.format_list, + field_size, + right_justify, + suffix ); else if (field[0] == 'd') job_format_add_min_tmp_disk( params.format_list, @@ -802,7 +802,7 @@ _print_options() printf( "iterate = %d\n", params.iterate ); printf( "job_flag = %d\n", params.job_flag ); printf( "jobs = %s\n", params.jobs ); - printf( "max_procs = %d\n", params.max_procs ) ; + printf( "max_cpus = %d\n", params.max_cpus ) ; printf( "nodes = %s\n", hostlist ) ; printf( "partitions = %s\n", params.partitions ) ; printf( "sort = %s\n", params.sort ) ; diff --git a/src/squeue/print.c b/src/squeue/print.c index 7b5465bebe7045bbfb8b2d4b9270acae69f78136..e4a3357af8cd9b8bb8c37597311903999e9c798e 100644 --- a/src/squeue/print.c +++ b/src/squeue/print.c @@ -647,17 +647,17 @@ int _print_job_node_inx(job_info_t * job, int width, bool right, char* suffix) return SLURM_SUCCESS; } -int _print_job_num_procs(job_info_t * job, int width, bool right, char* suffix) +int _print_job_num_cpus(job_info_t * job, int width, bool right, char* suffix) { char tmp_char[18]; if (job == NULL) /* Print the Header instead */ _print_str("CPUS", width, right, true); else { #ifdef HAVE_BG - convert_num_unit((float)job->num_procs, tmp_char, + convert_num_unit((float)job->num_cpus, tmp_char, sizeof(tmp_char), UNIT_NONE); #else - snprintf(tmp_char, sizeof(tmp_char), "%u", job->num_procs); + snprintf(tmp_char, sizeof(tmp_char), "%u", job->num_cpus); #endif _print_str(tmp_char, width, right, true); } @@ -705,8 +705,8 @@ static int _get_node_cnt(job_info_t * job) 
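
The _get_node_cnt() code in smap above and squeue below estimates a pending job's node count from the renamed field: at least the requested node count, and at least job->num_cpus rounded up by the largest per-node CPU count reported by slurm_load_node(). A small sketch of that estimate follows; estimate_node_cnt() is a hypothetical helper, and max_cpus_per_node is assumed to have been computed elsewhere, as _max_cpus_per_node() does.

	#include <stdio.h>
	#include <stdint.h>

	/*
	 * Pending-job node count estimate: at least the requested node
	 * count, and at least ceil(num_cpus / max_cpus_per_node).
	 */
	static uint32_t estimate_node_cnt(uint32_t num_cpus, uint32_t num_nodes,
					  uint32_t max_cpus_per_node)
	{
		uint32_t by_cpus;

		if (max_cpus_per_node == 0)
			max_cpus_per_node = 1;	/* defensive default */
		by_cpus = (num_cpus + max_cpus_per_node - 1) / max_cpus_per_node;
		return (num_nodes > by_cpus) ? num_nodes : by_cpus;
	}

	int main(void)
	{
		/* 10 CPUs on a cluster whose largest node has 4 CPUs -> 3 nodes. */
		printf("%u\n", estimate_node_cnt(10, 1, 4));
		return 0;
	}

For example, a pending 10-CPU job on a cluster whose largest node has 4 CPUs is charted as 3 nodes unless it explicitly asked for more.
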
if (base_job_state == JOB_PENDING || completing) { node_cnt = _nodes_in_list(job->req_nodes); node_cnt = MAX(node_cnt, job->num_nodes); - round = job->num_procs + params.max_procs - 1; - round /= params.max_procs; /* round up */ + round = job->num_cpus + params.max_cpus - 1; + round /= params.max_cpus; /* round up */ node_cnt = MAX(node_cnt, round); } else node_cnt = _nodes_in_list(job->nodes); @@ -778,8 +778,8 @@ int _print_job_contiguous(job_info_t * job, int width, bool right_justify, return SLURM_SUCCESS; } -int _print_job_min_procs(job_info_t * job, int width, bool right_justify, - char* suffix) +int _print_job_min_cpus(job_info_t * job, int width, bool right_justify, + char* suffix) { char tmp_char[8]; diff --git a/src/squeue/print.h b/src/squeue/print.h index 131b9417489d75d84283bef37bab5d7e7b6556cd..485ba5858294a4ddfdb5bf7c139712345b0fd37a 100644 --- a/src/squeue/print.h +++ b/src/squeue/print.h @@ -128,8 +128,8 @@ int job_format_add_function(List list, int width, bool right_justify, job_format_add_function(list,wid,right,suffix,_print_job_nodes) #define job_format_add_node_inx(list,wid,right,suffix) \ job_format_add_function(list,wid,right,suffix,_print_job_node_inx) -#define job_format_add_num_procs(list,wid,right,suffix) \ - job_format_add_function(list,wid,right,suffix,_print_job_num_procs) +#define job_format_add_num_cpus(list,wid,right,suffix) \ + job_format_add_function(list,wid,right,suffix,_print_job_num_cpus) #define job_format_add_num_nodes(list,wid,right,suffix) \ job_format_add_function(list,wid,right,suffix,_print_job_num_nodes) #define job_format_add_num_sct(list,wid,right,suffix) \ @@ -138,8 +138,8 @@ int job_format_add_function(List list, int width, bool right_justify, job_format_add_function(list,wid,right,suffix,_print_job_shared) #define job_format_add_contiguous(list,wid,right,suffix) \ job_format_add_function(list,wid,right,suffix,_print_job_contiguous) -#define job_format_add_min_procs(list,wid,right,suffix) \ - job_format_add_function(list,wid,right,suffix,_print_job_min_procs) +#define job_format_add_min_cpus(list,wid,right,suffix) \ + job_format_add_function(list,wid,right,suffix,_print_job_min_cpus) #define job_format_add_min_sockets(list,wid,right,suffix) \ job_format_add_function(list,wid,right,suffix,_print_job_min_sockets) #define job_format_add_min_cores(list,wid,right,suffix) \ @@ -222,7 +222,7 @@ int _print_job_node_inx(job_info_t * job, int width, bool right_justify, char* suffix); int _print_job_partition(job_info_t * job, int width, bool right_justify, char* suffix); -int _print_job_num_procs(job_info_t * job, int width, bool right_justify, +int _print_job_num_cpus(job_info_t * job, int width, bool right_justify, char* suffix); int _print_job_num_nodes(job_info_t * job, int width, bool right_justify, char* suffix); @@ -232,7 +232,7 @@ int _print_job_shared(job_info_t * job, int width, bool right_justify, char* suffix); int _print_job_contiguous(job_info_t * job, int width, bool right_justify, char* suffix); -int _print_job_min_procs(job_info_t * job, int width, bool right_justify, +int _print_job_min_cpus(job_info_t * job, int width, bool right_justify, char* suffix); int _print_job_min_sockets(job_info_t * job, int width, bool right_justify, char* suffix); diff --git a/src/squeue/sort.c b/src/squeue/sort.c index 1c7e0c0ebd234b31674b5ea2f608c31bdb8e4ab4..b8363c4b299b5ea47bfe6bd4a297b22122368385 100644 --- a/src/squeue/sort.c +++ b/src/squeue/sort.c @@ -64,7 +64,7 @@ static int _sort_job_by_time_start(void *void1, void *void2); static int 
_sort_job_by_time_used(void *void1, void *void2); static int _sort_job_by_node_list(void *void1, void *void2); static int _sort_job_by_num_nodes(void *void1, void *void2); -static int _sort_job_by_num_procs(void *void1, void *void2); +static int _sort_job_by_num_cpus(void *void1, void *void2); static int _sort_job_by_num_sct(void *void1, void *void2); static int _sort_job_by_min_sockets(void *void1, void *void2); static int _sort_job_by_min_cores(void *void1, void *void2); @@ -108,7 +108,7 @@ void sort_job_list(List job_list) if (params.sort[i] == 'c') ; /* sort_job_by_min_cpus_per_node */ else if (params.sort[i] == 'C') - list_sort(job_list, _sort_job_by_num_procs); + list_sort(job_list, _sort_job_by_num_cpus); else if (params.sort[i] == 'd') list_sort(job_list, _sort_job_by_min_tmp_disk); else if (params.sort[i] == 'D') @@ -342,13 +342,13 @@ static int _sort_job_by_num_nodes(void *void1, void *void2) return diff; } -static int _sort_job_by_num_procs(void *void1, void *void2) +static int _sort_job_by_num_cpus(void *void1, void *void2) { int diff; job_info_t *job1 = (job_info_t *) void1; job_info_t *job2 = (job_info_t *) void2; - diff = job1->num_procs - job2->num_procs; + diff = job1->num_cpus - job2->num_cpus; if (reverse_order) diff = -diff; diff --git a/src/squeue/squeue.h b/src/squeue/squeue.h index e43229765af047d1822fdc3f9c8fb145edf905e7..2cad21cad924f5f54d98101af69ba7d2b6b0ee7a 100644 --- a/src/squeue/squeue.h +++ b/src/squeue/squeue.h @@ -83,7 +83,7 @@ struct squeue_parameters { bool long_list; bool no_header; int iterate; - int max_procs; + int max_cpus; int verbose; char* accounts; diff --git a/src/srun/allocate.c b/src/srun/allocate.c index 5637ca969b04a5d7b0cbeb16228c232ae8504e56..b32b6820c2707badb326302b9ab83dcb90c30e24 100644 --- a/src/srun/allocate.c +++ b/src/srun/allocate.c @@ -656,12 +656,12 @@ job_desc_msg_create_from_opts () if (opt.job_min_tmp_disk != NO_VAL) j->job_min_tmp_disk = opt.job_min_tmp_disk; if (opt.overcommit) { - j->num_procs = opt.min_nodes; - j->overcommit = opt.overcommit; + j->min_cpus = opt.min_nodes; + j->overcommit = opt.overcommit; } else - j->num_procs = opt.nprocs * opt.cpus_per_task; + j->min_cpus = opt.nprocs * opt.cpus_per_task; if (opt.nprocs_set) - j->num_tasks = opt.nprocs; + j->num_tasks = opt.nprocs; if (opt.cpus_set) j->cpus_per_task = opt.cpus_per_task; diff --git a/src/sview/job_info.c b/src/sview/job_info.c index c6b01812e8f8a5b7024d26f74abfed8b2d0334d5..0df33a57ecb23aa9546318a3ba82c0dcd8d4db7d 100644 --- a/src/sview/job_info.c +++ b/src/sview/job_info.c @@ -86,10 +86,9 @@ enum { SORTID_CONTIGUOUS, /* SORTID_CORES_MAX, */ /* SORTID_CORES_MIN, */ - SORTID_CPU_NUM, -#ifdef HAVE_BG + SORTID_CPUS, SORTID_CPU_MAX, -#endif + SORTID_CPU_MIN, SORTID_CPUS_PER_TASK, SORTID_DEPENDENCY, SORTID_EXIT_CODE, @@ -244,6 +243,8 @@ static display_data_t display_data_job[] = { EDIT_TEXTBOX, refresh_job, create_model_job, admin_edit_job}, {G_TYPE_STRING, SORTID_NODES, "Node Count", TRUE, EDIT_NONE, refresh_job, create_model_job, admin_edit_job}, + {G_TYPE_STRING, SORTID_CPUS, "CPU Count", + FALSE, EDIT_NONE, refresh_job, create_model_job, admin_edit_job}, #ifdef HAVE_BG {G_TYPE_STRING, SORTID_NODELIST, "BP List", TRUE, EDIT_NONE, refresh_job, create_model_job, admin_edit_job}, @@ -271,12 +272,10 @@ static display_data_t display_data_job[] = { EDIT_NONE, refresh_job, create_model_job, admin_edit_job}, {G_TYPE_STRING, SORTID_BATCH, "Batch Flag", FALSE, EDIT_NONE, refresh_job, create_model_job, admin_edit_job}, -#ifdef HAVE_BG + {G_TYPE_STRING, 
SORTID_CPU_MIN, "CPUs Min", + FALSE, EDIT_TEXTBOX, refresh_job, create_model_job, admin_edit_job}, {G_TYPE_STRING, SORTID_CPU_MAX, "CPUs Max", - FALSE, EDIT_NONE, refresh_job, create_model_job, admin_edit_job}, -#endif - {G_TYPE_STRING, SORTID_CPU_NUM, "CPU Count", - FALSE, EDIT_NONE, refresh_job, create_model_job, admin_edit_job}, + FALSE, EDIT_TEXTBOX, refresh_job, create_model_job, admin_edit_job}, {G_TYPE_STRING, SORTID_TASKS, "Task Count", FALSE, EDIT_TEXTBOX, refresh_job, create_model_job, admin_edit_job}, {G_TYPE_STRING, SORTID_SHARED, "Shared", FALSE, @@ -1215,26 +1214,39 @@ static void _layout_job_record(GtkTreeView *treeview, tmp_char); #ifdef HAVE_BG - convert_num_unit((float)job_ptr->num_procs, tmp_char, sizeof(tmp_char), + convert_num_unit((float)job_ptr->num_cpus, tmp_char, sizeof(tmp_char), UNIT_NONE); #else - snprintf(tmp_char, sizeof(tmp_char), "%u", job_ptr->num_procs); + snprintf(tmp_char, sizeof(tmp_char), "%u", job_ptr->num_cpus); #endif add_display_treestore_line(update, treestore, &iter, find_col_name(display_data_job, - SORTID_CPU_NUM), + SORTID_CPUS), tmp_char); #ifdef HAVE_BG + select_g_select_jobinfo_sprint(job_ptr->select_jobinfo, + tmp_char, + sizeof(tmp_char), + SELECT_PRINT_MAX_CPUS); +#else + snprintf(tmp_char, sizeof(tmp_char), "%u", job_ptr->max_cpus); +#endif add_display_treestore_line(update, treestore, &iter, find_col_name(display_data_job, SORTID_CPU_MAX), - select_g_select_jobinfo_sprint( - job_ptr->select_jobinfo, - tmp_char, - sizeof(tmp_char), - SELECT_PRINT_MAX_CPUS)); + tmp_char); + +#ifdef HAVE_BG + convert_num_unit((float)job_ptr->num_cpus, tmp_char, sizeof(tmp_char), + UNIT_NONE); +#else + snprintf(tmp_char, sizeof(tmp_char), "%u", job_ptr->num_cpus); #endif + add_display_treestore_line(update, treestore, &iter, + find_col_name(display_data_job, + SORTID_CPU_MIN), + tmp_char); if(job_ptr->cpus_per_task > 0) sprintf(tmp_char, "%u", job_ptr->cpus_per_task); @@ -1695,13 +1707,6 @@ static void _update_job_record(sview_job_info_t *sview_job_info_ptr, tmp_char, sizeof(tmp_char), SELECT_PRINT_ROTATE), -1); - gtk_tree_store_set(treestore, iter, - SORTID_CPU_MAX, - select_g_select_jobinfo_sprint( - job_ptr->select_jobinfo, - tmp_char, - sizeof(tmp_char), - SELECT_PRINT_MAX_CPUS), -1); #ifdef HAVE_BGL gtk_tree_store_set(treestore, iter, SORTID_IMAGE_BLRTS, @@ -1764,13 +1769,33 @@ static void _update_job_record(sview_job_info_t *sview_job_info_ptr, SORTID_NODES, tmp_char, -1); #ifdef HAVE_BG - convert_num_unit((float)job_ptr->num_procs, tmp_char, sizeof(tmp_char), + convert_num_unit((float)job_ptr->num_cpus, tmp_char, sizeof(tmp_char), UNIT_NONE); #else - snprintf(tmp_char, sizeof(tmp_char), "%u", job_ptr->num_procs); + snprintf(tmp_char, sizeof(tmp_char), "%u", job_ptr->num_cpus); +#endif + gtk_tree_store_set(treestore, iter, + SORTID_CPUS, tmp_char, -1); + +#ifdef HAVE_BG + convert_num_unit((float)job_ptr->num_cpus, tmp_char, sizeof(tmp_char), + UNIT_NONE); +#else + snprintf(tmp_char, sizeof(tmp_char), "%u", job_ptr->num_cpus); +#endif + gtk_tree_store_set(treestore, iter, + SORTID_CPU_MIN, tmp_char, -1); + +#ifdef HAVE_BG + select_g_select_jobinfo_sprint(job_ptr->select_jobinfo, + tmp_char, + sizeof(tmp_char), + SELECT_PRINT_MAX_CPUS); +#else + snprintf(tmp_char, sizeof(tmp_char), "%u", job_ptr->max_cpus); #endif gtk_tree_store_set(treestore, iter, - SORTID_CPU_NUM, tmp_char, -1); + SORTID_CPU_MAX, tmp_char, -1); gtk_tree_store_set(treestore, iter, SORTID_NODELIST, nodes, -1); @@ -1936,7 +1961,7 @@ static void _layout_step_record(GtkTreeView 
*treeview, UNIT_NONE); add_display_treestore_line(update, treestore, &iter, find_col_name(display_data_job, - SORTID_CPU_NUM), + SORTID_CPU_MIN), tmp_char); uname = uid_to_string((uid_t)step_ptr->user_id); @@ -2089,7 +2114,7 @@ static void _update_step_record(job_step_info_t *step_ptr, convert_num_unit((float)step_ptr->num_cpus, tmp_char, sizeof(tmp_char), UNIT_NONE); gtk_tree_store_set(treestore, iter, - SORTID_CPU_NUM, tmp_char, -1); + SORTID_CPU_MIN, tmp_char, -1); gtk_tree_store_set(treestore, iter, SORTID_NODELIST, nodes, -1);
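
From an API consumer's point of view, the client-visible part of this patch is a rename plus one new field: job_info_t's num_procs becomes num_cpus (the minimum CPU count) and max_cpus is added alongside it, which is what squeue, smap and sview now read. A hedged example of consuming the renamed fields through the public job API (assumes a slurm.h that already contains this change; link with -lslurm):

	#include <stdio.h>
	#include <time.h>
	#include <slurm/slurm.h>
	#include <slurm/slurm_errno.h>

	/* List each job's CPU limits using the renamed job_info_t fields. */
	int main(void)
	{
		job_info_msg_t *msg = NULL;
		uint32_t i;

		if (slurm_load_jobs((time_t) NULL, &msg, SHOW_ALL) !=
		    SLURM_SUCCESS) {
			slurm_perror("slurm_load_jobs");
			return 1;
		}
		for (i = 0; i < msg->record_count; i++) {
			job_info_t *job = &msg->job_array[i];
			printf("job %u: cpus %u-%u nodes %u\n",
			       job->job_id, job->num_cpus, job->max_cpus,
			       job->num_nodes);
		}
		slurm_free_job_info_msg(msg);
		return 0;
	}

Note that the controller packs 0 for the maximum once a job's total_cpus is set (see the pack hunk at the start of this section), so clients should treat a zero max_cpus as "no separate maximum", much as the sview code above does.
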