diff --git a/src/common/slurm_protocol_defs.c b/src/common/slurm_protocol_defs.c index b9d7b5c7ca637c61295b4586ac41257cc21c4972..e916ec3082961149898b4d9fe8cedf0f4f25f3cc 100644 --- a/src/common/slurm_protocol_defs.c +++ b/src/common/slurm_protocol_defs.c @@ -1330,9 +1330,9 @@ extern char *job_reason_string(enum job_state_reason inx) case WAIT_QOS_GRP_CPU: return "QOSGrpCpuLimit"; case WAIT_QOS_GRP_CPU_MIN: - return "QOSGrpCPUMinsLimit"; + return "QOSGrpCPUMinutesLimit"; case WAIT_QOS_GRP_CPU_RUN_MIN: - return "QOSGrpCPURunMinsLimit"; + return "QOSGrpCPURunMinutesLimit"; case WAIT_QOS_GRP_JOB: return"QOSGrpJobsLimit"; case WAIT_QOS_GRP_MEM: @@ -1346,7 +1346,7 @@ extern char *job_reason_string(enum job_state_reason inx) case WAIT_QOS_MAX_CPU_PER_JOB: return "QOSMaxCpuPerJobLimit"; case WAIT_QOS_MAX_CPU_MINS_PER_JOB: - return "QOSMaxCpuMinsPerJobLimit"; + return "QOSMaxCpuMinutesPerJobLimit"; case WAIT_QOS_MAX_NODE_PER_JOB: return "QOSMaxNodePerJobLimit"; case WAIT_QOS_MAX_WALL_PER_JOB: @@ -1364,9 +1364,9 @@ extern char *job_reason_string(enum job_state_reason inx) case WAIT_ASSOC_GRP_CPU: return "AssocGrpCpuLimit"; case WAIT_ASSOC_GRP_CPU_MIN: - return "AssocGrpCPUMinsLimit"; + return "AssocGrpCPUMinutesLimit"; case WAIT_ASSOC_GRP_CPU_RUN_MIN: - return "AssocGrpCPURunMinsLimit"; + return "AssocGrpCPURunMinutesLimit"; case WAIT_ASSOC_GRP_JOB: return"AssocGrpJobsLimit"; case WAIT_ASSOC_GRP_MEM: @@ -1382,7 +1382,7 @@ extern char *job_reason_string(enum job_state_reason inx) case WAIT_ASSOC_MAX_CPU_PER_JOB: return "AssocMaxCpuPerJobLimit"; case WAIT_ASSOC_MAX_CPU_MINS_PER_JOB: - return "AssocMaxCpuMinsPerJobLimit"; + return "AssocMaxCpuMinutesPerJobLimit"; case WAIT_ASSOC_MAX_NODE_PER_JOB: return "AssocMaxNodePerJobLimit"; case WAIT_ASSOC_MAX_WALL_PER_JOB: @@ -1406,86 +1406,86 @@ extern char *job_reason_string(enum job_state_reason inx) case WAIT_ASSOC_GRP_UNK: return "AssocGrpUnknown"; case WAIT_ASSOC_GRP_UNK_MIN: - return "AssocGrpUnknownMin"; + return "AssocGrpUnknownMinutes"; case WAIT_ASSOC_GRP_UNK_RUN_MIN: - return "AssocGrpUnknownRunMin"; + return "AssocGrpUnknownRunMinutes"; case WAIT_ASSOC_MAX_UNK_PER_JOB: return "AssocMaxUnknownPerJob"; case WAIT_ASSOC_MAX_UNK_PER_NODE: return "AssocMaxUnknownPerNode"; case WAIT_ASSOC_MAX_UNK_MINS_PER_JOB: - return "AssocMaxUnknownMinPerJob"; + return "AssocMaxUnknownMinutesPerJob"; case WAIT_ASSOC_MAX_CPU_PER_NODE: return "AssocMaxCpuPerNode"; case WAIT_ASSOC_GRP_MEM_MIN: - return "AssocGrpMemMin"; + return "AssocGrpMemMinutes"; case WAIT_ASSOC_GRP_MEM_RUN_MIN: - return "AssocGrpMemRunMin"; + return "AssocGrpMemRunMinutes"; case WAIT_ASSOC_MAX_MEM_PER_JOB: return "AssocMaxMemPerJob"; case WAIT_ASSOC_MAX_MEM_PER_NODE: return "AssocMaxMemPerNode"; case WAIT_ASSOC_MAX_MEM_MINS_PER_JOB: - return "AssocMaxMemMinPerJob"; + return "AssocMaxMemMinutesPerJob"; case WAIT_ASSOC_GRP_NODE_MIN: - return "AssocGrpNodeMin"; + return "AssocGrpNodeMinutes"; case WAIT_ASSOC_GRP_NODE_RUN_MIN: - return "AssocGrpNodeRunMin"; + return "AssocGrpNodeRunMinutes"; case WAIT_ASSOC_MAX_NODE_MINS_PER_JOB: - return "AssocMaxNodeMinPerJob"; + return "AssocMaxNodeMinutesPerJob"; case WAIT_ASSOC_GRP_ENERGY: return "AssocGrpEnergy"; case WAIT_ASSOC_GRP_ENERGY_MIN: - return "AssocGrpEnergyMin"; + return "AssocGrpEnergyMinutes"; case WAIT_ASSOC_GRP_ENERGY_RUN_MIN: - return "AssocGrpEnergyRunMin"; + return "AssocGrpEnergyRunMinutes"; case WAIT_ASSOC_MAX_ENERGY_PER_JOB: return "AssocMaxEnergyPerJob"; case WAIT_ASSOC_MAX_ENERGY_PER_NODE: return "AssocMaxEnergyPerNode"; case WAIT_ASSOC_MAX_ENERGY_MINS_PER_JOB: - return "AssocMaxEnergyMinPerJob"; + return "AssocMaxEnergyMinutesPerJob"; case WAIT_ASSOC_GRP_GRES: return "AssocGrpGRES"; case WAIT_ASSOC_GRP_GRES_MIN: - return "AssocGrpGRESMin"; + return "AssocGrpGRESMinutes"; case WAIT_ASSOC_GRP_GRES_RUN_MIN: - return "AssocGrpGRESRunMin"; + return "AssocGrpGRESRunMinutes"; case WAIT_ASSOC_MAX_GRES_PER_JOB: return "AssocMaxGRESPerJob"; case WAIT_ASSOC_MAX_GRES_PER_NODE: return "AssocMaxGRESPerNode"; case WAIT_ASSOC_MAX_GRES_MINS_PER_JOB: - return "AssocMaxGRESMinPerJob"; + return "AssocMaxGRESMinutesPerJob"; case WAIT_ASSOC_GRP_LIC: return "AssocGrpLicense"; case WAIT_ASSOC_GRP_LIC_MIN: - return "AssocGrpLicenseMin"; + return "AssocGrpLicenseMinutes"; case WAIT_ASSOC_GRP_LIC_RUN_MIN: - return "AssocGrpLicenseRunMin"; + return "AssocGrpLicenseRunMinutes"; case WAIT_ASSOC_MAX_LIC_PER_JOB: return "AssocMaxLicensePerJob"; case WAIT_ASSOC_MAX_LIC_MINS_PER_JOB: - return "AssocMaxLicenseMinPerJob"; + return "AssocMaxLicenseMinutesPerJob"; case WAIT_ASSOC_GRP_BB: return "AssocGrpBB"; case WAIT_ASSOC_GRP_BB_MIN: - return "AssocGrpBBMin"; + return "AssocGrpBBMinutes"; case WAIT_ASSOC_GRP_BB_RUN_MIN: - return "AssocGrpBBRunMin"; + return "AssocGrpBBRunMinutes"; case WAIT_ASSOC_MAX_BB_PER_JOB: return "AssocMaxBBPerJob"; case WAIT_ASSOC_MAX_BB_PER_NODE: return "AssocMaxBBPerNode"; case WAIT_ASSOC_MAX_BB_MINS_PER_JOB: - return "AssocMaxBBMinPerJob"; + return "AssocMaxBBMinutesPerJob"; case WAIT_QOS_GRP_UNK: return "QOSGrpUnknown"; case WAIT_QOS_GRP_UNK_MIN: - return "QOSGrpUnknownMin"; + return "QOSGrpUnknownMinutes"; case WAIT_QOS_GRP_UNK_RUN_MIN: - return "QOSGrpUnknownRunMin"; + return "QOSGrpUnknownRunMinutes"; case WAIT_QOS_MAX_UNK_PER_JOB: return "QOSMaxUnknownPerJob"; case WAIT_QOS_MAX_UNK_PER_NODE: @@ -1493,15 +1493,15 @@ extern char *job_reason_string(enum job_state_reason inx) case WAIT_QOS_MAX_UNK_PER_USER: return "QOSMaxUnknownPerUser"; case WAIT_QOS_MAX_UNK_MINS_PER_JOB: - return "QOSMaxUnknownMinPerJob"; + return "QOSMaxUnknownMinutesPerJob"; case WAIT_QOS_MIN_UNK: return "QOSMinUnknown"; case WAIT_QOS_MAX_CPU_PER_NODE: return "QOSMaxCpuPerNode"; case WAIT_QOS_GRP_MEM_MIN: - return "QOSGrpMemoryMin"; + return "QOSGrpMemoryMinutes"; case WAIT_QOS_GRP_MEM_RUN_MIN: - return "QOSGrpMemoryRunMin"; + return "QOSGrpMemoryRunMinutes"; case WAIT_QOS_MAX_MEM_PER_JOB: return "QOSMaxMemoryPerJob"; case WAIT_QOS_MAX_MEM_PER_NODE: @@ -1509,23 +1509,23 @@ extern char *job_reason_string(enum job_state_reason inx) case WAIT_QOS_MAX_MEM_PER_USER: return "QOSMaxMemoryPerUser"; case WAIT_QOS_MAX_MEM_MINS_PER_JOB: - return "QOSMaxMemoryMinPerJob"; + return "QOSMaxMemoryMinutesPerJob"; case WAIT_QOS_MIN_MEM: return "QOSMinMemory"; case WAIT_QOS_GRP_NODE_MIN: - return "QOSGrpNodeMin"; + return "QOSGrpNodeMinutes"; case WAIT_QOS_GRP_NODE_RUN_MIN: - return "QOSGrpNodeRunMin"; + return "QOSGrpNodeRunMinutes"; case WAIT_QOS_MAX_NODE_MINS_PER_JOB: - return "QOSMaxNodeMinPerJob"; + return "QOSMaxNodeMinutesPerJob"; case WAIT_QOS_MIN_NODE: return "QOSMinNode"; case WAIT_QOS_GRP_ENERGY: return "QOSGrpEnergy"; case WAIT_QOS_GRP_ENERGY_MIN: - return "QOSGrpEnergyMin"; + return "QOSGrpEnergyMinutes"; case WAIT_QOS_GRP_ENERGY_RUN_MIN: - return "QOSGrpEnergyRunMin"; + return "QOSGrpEnergyRunMinutes"; case WAIT_QOS_MAX_ENERGY_PER_JOB: return "QOSMaxEnergyPerJob"; case WAIT_QOS_MAX_ENERGY_PER_NODE: @@ -1533,15 +1533,15 @@ extern char *job_reason_string(enum job_state_reason inx) case WAIT_QOS_MAX_ENERGY_PER_USER: return "QOSMaxEnergyPerUser"; case WAIT_QOS_MAX_ENERGY_MINS_PER_JOB: - return "QOSMaxEnergyMinPerJob"; + return "QOSMaxEnergyMinutesPerJob"; case WAIT_QOS_MIN_ENERGY: return "QOSMinEnergy"; case WAIT_QOS_GRP_GRES: return "QOSGrpGRES"; case WAIT_QOS_GRP_GRES_MIN: - return "QOSGrpGRESMin"; + return "QOSGrpGRESMinutes"; case WAIT_QOS_GRP_GRES_RUN_MIN: - return "QOSGrpGRESRunMin"; + return "QOSGrpGRESRunMinutes"; case WAIT_QOS_MAX_GRES_PER_JOB: return "QOSMaxGRESPerJob"; case WAIT_QOS_MAX_GRES_PER_NODE: @@ -1549,29 +1549,29 @@ extern char *job_reason_string(enum job_state_reason inx) case WAIT_QOS_MAX_GRES_PER_USER: return "QOSMaxGRESPerUser"; case WAIT_QOS_MAX_GRES_MINS_PER_JOB: - return "QOSMaxGRESMinPerJob"; + return "QOSMaxGRESMinutesPerJob"; case WAIT_QOS_MIN_GRES: return "QOSMinGRES"; case WAIT_QOS_GRP_LIC: return "QOSGrpLicense"; case WAIT_QOS_GRP_LIC_MIN: - return "QOSGrpLicenseMin"; + return "QOSGrpLicenseMinutes"; case WAIT_QOS_GRP_LIC_RUN_MIN: - return "QOSGrpLicenseRunMin"; + return "QOSGrpLicenseRunMinutes"; case WAIT_QOS_MAX_LIC_PER_JOB: return "QOSMaxLicensePerJob"; case WAIT_QOS_MAX_LIC_PER_USER: return "QOSMaxLicensePerUser"; case WAIT_QOS_MAX_LIC_MINS_PER_JOB: - return "QOSMaxLicenseMinPerJob"; + return "QOSMaxLicenseMinutesPerJob"; case WAIT_QOS_MIN_LIC: return "QOSMinLicense"; case WAIT_QOS_GRP_BB: return "QOSGrpBB"; case WAIT_QOS_GRP_BB_MIN: - return "QOSGrpBBMin"; + return "QOSGrpBBMinutes"; case WAIT_QOS_GRP_BB_RUN_MIN: - return "QOSGrpBBRunMin"; + return "QOSGrpBBRunMinutes"; case WAIT_QOS_MAX_BB_PER_JOB: return "QOSMaxBBPerJob"; case WAIT_QOS_MAX_BB_PER_NODE: @@ -1579,7 +1579,7 @@ extern char *job_reason_string(enum job_state_reason inx) case WAIT_QOS_MAX_BB_PER_USER: return "QOSMaxBBPerUser"; case WAIT_QOS_MAX_BB_MINS_PER_JOB: - return "AssocMaxBBMinPerJob"; + return "AssocMaxBBMinutesPerJob"; case WAIT_QOS_MIN_BB: return "QOSMinBB"; default: diff --git a/src/common/slurmdb_defs.c b/src/common/slurmdb_defs.c index d5cda4691f9fd3abd7748cddd235457374d6a951..288467f06689203e8223918b06178d87ce56fde0 100644 --- a/src/common/slurmdb_defs.c +++ b/src/common/slurmdb_defs.c @@ -551,6 +551,7 @@ extern void slurmdb_destroy_assoc_usage(void *object) FREE_NULL_BITMAP(usage->valid_qos); xfree(usage->grp_used_tres_run_secs); xfree(usage->grp_used_tres); + xfree(usage->usage_tres_raw); xfree(usage); } } @@ -565,6 +566,7 @@ extern void slurmdb_destroy_qos_usage(void *object) FREE_NULL_LIST(usage->user_limit_list); xfree(usage->grp_used_tres_run_secs); xfree(usage->grp_used_tres); + xfree(usage->usage_tres_raw); xfree(usage); } } diff --git a/src/plugins/burst_buffer/common/burst_buffer_common.c b/src/plugins/burst_buffer/common/burst_buffer_common.c index c511e2fc7adfa2d82bf059fe7c75571bbfaa32a6..4ed750aee3bf788b5b311417d81101496493fbb4 100644 --- a/src/plugins/burst_buffer/common/burst_buffer_common.c +++ b/src/plugins/burst_buffer/common/burst_buffer_common.c @@ -422,7 +422,7 @@ static uint64_t _atoi(char *tok) } #endif -/* Set the bb_state's tres_pos for limit enforcement. +/* Set the bb_state's tres_id and tres_pos for limit enforcement. * Value is set to -1 if not found. */ extern void bb_set_tres_pos(bb_state_t *state_ptr) { @@ -438,7 +438,8 @@ extern void bb_set_tres_pos(bb_state_t *state_ptr) debug("%s: Tres %s not found by assoc_mgr", __func__, state_ptr->name); } else { - state_ptr->tres_pos = assoc_mgr_tres_array[inx]->id; + state_ptr->tres_id = assoc_mgr_tres_array[inx]->id; + state_ptr->tres_pos = inx; } } @@ -1529,9 +1530,10 @@ extern int bb_post_persist_create(bb_alloc_t *bb_alloc, bb_state_t *state_ptr) resv.assocs = bb_alloc->assocs; resv.cluster = slurmctld_cluster_name; resv.name = bb_alloc->name; + resv.id = bb_alloc->id; resv.time_start = bb_alloc->create_time; - xstrfmtcat(resv.tres_str, "bb/%s", state_ptr->name); - + xstrfmtcat(resv.tres_str, "%d=%"PRIu64, + state_ptr->tres_id, bb_alloc->size / (1024 * 1024)); rc = acct_storage_g_add_reservation(acct_db_conn, &resv); xfree(resv.tres_str); @@ -1548,11 +1550,13 @@ extern int bb_post_persist_delete(bb_alloc_t *bb_alloc, bb_state_t *state_ptr) resv.assocs = bb_alloc->assocs; resv.cluster = slurmctld_cluster_name; resv.name = bb_alloc->name; + resv.id = bb_alloc->id; resv.time_end = time(NULL); resv.time_start = bb_alloc->create_time; - xstrfmtcat(resv.tres_str, "bb/%s", state_ptr->name); + xstrfmtcat(resv.tres_str, "%d=%"PRIu64, + state_ptr->tres_id, bb_alloc->size / (1024 * 1024)); - rc = acct_storage_g_add_reservation(acct_db_conn, &resv); + rc = acct_storage_g_remove_reservation(acct_db_conn, &resv); xfree(resv.tres_str); return rc; diff --git a/src/plugins/burst_buffer/common/burst_buffer_common.h b/src/plugins/burst_buffer/common/burst_buffer_common.h index a1afc016005c3bd6afbc9022b0b7ff2e8965ab00..b1ae4b5c97578deedf4730ba678b09b284fe9938 100644 --- a/src/plugins/burst_buffer/common/burst_buffer_common.h +++ b/src/plugins/burst_buffer/common/burst_buffer_common.h @@ -94,9 +94,11 @@ typedef struct bb_alloc { uint32_t array_job_id; uint32_t array_task_id; bool cancelled; + time_t create_time; /* Time of creation */ time_t end_time; /* Expected time when use will end */ uint32_t gres_cnt; /* Count of records in gres_ptr */ burst_buffer_gres_t *gres_ptr; + uint32_t id; /* ID for reservation/accounting */ uint32_t job_id; uint32_t magic; char *name; /* For persistent burst buffers */ @@ -106,7 +108,6 @@ typedef struct bb_alloc { time_t seen_time; /* Time buffer last seen */ uint64_t size; uint16_t state; - time_t create_time; /* Time of creation */ time_t state_time; /* Time of last state change */ time_t use_time; /* Expected time when use will begin */ uint32_t user_id; @@ -203,6 +204,7 @@ typedef struct bb_state { bool term_flag; pthread_mutex_t term_mutex; uint64_t total_space; /* units are bytes */ + int tres_id; /* TRES ID, for limits */ int tres_pos; /* TRES index, for limits */ uint64_t used_space; /* units are bytes */ } bb_state_t; diff --git a/src/plugins/burst_buffer/cray/burst_buffer_cray.c b/src/plugins/burst_buffer/cray/burst_buffer_cray.c index 7c16cf17de519d99f561eda9c1af9ee9afeff170..344d9a506d0c93f58c7ad2e0a73c1ffafb24e8e9 100644 --- a/src/plugins/burst_buffer/cray/burst_buffer_cray.c +++ b/src/plugins/burst_buffer/cray/burst_buffer_cray.c @@ -100,6 +100,7 @@ const uint32_t plugin_version = SLURM_VERSION_NUMBER; /* Most state information is in a common structure so that we can more * easily use common functions from multiple burst buffer plugins */ static bb_state_t bb_state; +static uint32_t last_persistent_id = 1; static char * state_save_loc = NULL; /* Description of each Cray DW configuration entry @@ -622,6 +623,7 @@ static void _save_limits_state(void) if (bb_alloc->name) { packstr(bb_alloc->account, buffer); pack_time(bb_alloc->create_time,buffer); + pack32(bb_alloc->id, buffer); packstr(bb_alloc->name, buffer); packstr(bb_alloc->partition, buffer); packstr(bb_alloc->qos, buffer); @@ -730,7 +732,8 @@ static void _recover_limit_state(void) char *state_file = NULL, *data = NULL; int data_allocated, data_read = 0; uint16_t protocol_version = (uint16_t)NO_VAL; - uint32_t data_size = 0, rec_count = 0, name_len = 0, user_id = 0; + uint32_t data_size = 0, rec_count = 0, name_len = 0; + uint32_t id = 0, user_id = 0; uint64_t size; int i, state_fd; char *account = NULL, *name = NULL, *partition = NULL, *qos = NULL; @@ -779,6 +782,7 @@ static void _recover_limit_state(void) for (i = 0; i < rec_count; i++) { safe_unpackstr_xmalloc(&account, &name_len, buffer); safe_unpack_time(&create_time, buffer); + safe_unpack32(&id, buffer); safe_unpackstr_xmalloc(&name, &name_len, buffer); safe_unpackstr_xmalloc(&partition, &name_len, buffer); safe_unpackstr_xmalloc(&qos, &name_len, buffer); @@ -788,6 +792,8 @@ static void _recover_limit_state(void) if (bb_state.bb_config.flags & BB_FLAG_EMULATE_CRAY) { bb_alloc = bb_alloc_name_rec(&bb_state, name, user_id); + bb_alloc->id = id; + last_persistent_id = MAX(last_persistent_id, id); if (name && (name[0] >='0') && (name[0] <='9')) bb_alloc->job_id = strtol(name, &end_ptr, 10); bb_alloc->seen_time = time(NULL); @@ -2619,7 +2625,7 @@ static void _update_job_env(struct job_record *job_ptr, char *file_path) stat_buf.st_size = 2048; } else if (stat_buf.st_size == 0) goto fini; - data_buf = xmalloc(stat_buf.st_size); + data_buf = xmalloc(stat_buf.st_size + 1); while (inx < stat_buf.st_size) { read_size = read(path_fd, data_buf + inx, stat_buf.st_size); if (read_size > 0) { @@ -3446,7 +3452,7 @@ static void _reset_buf_state(uint32_t user_id, uint32_t job_id, char *name, bb_buf_t *buf_ptr; bb_job_t *bb_job; int i, old_state; - bool active_buf; + bool active_buf = false; bb_job = bb_job_find(&bb_state, job_id); if (!bb_job) { @@ -3600,6 +3606,12 @@ if (0) { //FIXME: Cray bug: API exit code NOT 0 on success as documented bb_alloc->qos = xstrdup(qos_ptr->name); } } +//FIXME: Read create time and ID as set in DW database here + if (bb_state.bb_config.flags & BB_FLAG_EMULATE_CRAY) { + bb_alloc->create_time = time(NULL); + bb_alloc->id = ++last_persistent_id; + } + (void) bb_post_persist_create(bb_alloc, &bb_state); pthread_mutex_unlock(&bb_state.bb_mutex); unlock_slurmctld(job_write_lock); } @@ -3685,7 +3697,8 @@ static void *_destroy_persistent(void *x) bb_alloc->partition, bb_alloc->qos, bb_alloc->size, &bb_state); (void) bb_free_alloc_rec(&bb_state, bb_alloc); - unlock_slurmctld(job_write_lock); + (void) bb_post_persist_delete(bb_alloc, &bb_state); + pthread_mutex_unlock(&bb_state.bb_mutex); } xfree(resp_msg); _free_create_args(destroy_args); @@ -4262,7 +4275,7 @@ extern char *bb_p_xlate_bb_2_tres_str(char *burst_buffer) char *result = NULL; uint64_t size, total = 0; - if (!burst_buffer || (bb_state.tres_pos < 1)) + if (!burst_buffer || (bb_state.tres_id < 1)) return result; tmp = xstrdup(burst_buffer); @@ -4287,7 +4300,7 @@ extern char *bb_p_xlate_bb_2_tres_str(char *burst_buffer) } if (total) - xstrfmtcat(result, "%d=%"PRIu64, bb_state.tres_pos, total); + xstrfmtcat(result, "%d=%"PRIu64, bb_state.tres_id, total); return result; }