From 0629201ade01009e82cea653dca342b840e4897c Mon Sep 17 00:00:00 2001
From: Danny Auble <da@schedmd.com>
Date: Wed, 12 Aug 2015 17:25:32 -0700
Subject: [PATCH] Added a bunch of new waiting codes dealing with TRES

---
 slurm/slurm.h.in                 | 158 +++++++++++++--
 src/common/slurm_protocol_defs.c | 195 +++++++++++++++++--
 src/slurmctld/acct_policy.c      | 318 ++++++++++++++++++++++++++++---
 3 files changed, 602 insertions(+), 69 deletions(-)
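Before the diff itself, a note on the pattern it applies across all three files: instead of hard-coding CPU-specific wait reasons, the limit checks now report a generic per-TRES reason code that is then specialized to whichever TRES actually tripped the limit. A minimal, self-contained sketch of that idea (all names below are invented for illustration; the real mapping is the get_tres_state_reason() helper added to src/slurmctld/acct_policy.c):

```c
#include <stdio.h>

/* Toy stand-ins for Slurm's TRES indexes and job_state_reason codes. */
enum toy_tres { TOY_TRES_CPU, TOY_TRES_MEM, TOY_TRES_NODE };
enum toy_reason {
	TOY_WAIT_GRP_UNK,	/* generic "some TRES group limit hit" */
	TOY_WAIT_GRP_CPU,
	TOY_WAIT_GRP_MEM,
	TOY_WAIT_GRP_NODE,
};

/* Specialize a generic reason to the TRES that hit the limit,
 * mirroring the shape of get_tres_state_reason(). */
static enum toy_reason toy_specialize(enum toy_tres tres,
				      enum toy_reason unk_reason)
{
	if (unk_reason != TOY_WAIT_GRP_UNK)
		return unk_reason;	/* already specific */
	switch (tres) {
	case TOY_TRES_CPU:
		return TOY_WAIT_GRP_CPU;
	case TOY_TRES_MEM:
		return TOY_WAIT_GRP_MEM;
	case TOY_TRES_NODE:
		return TOY_WAIT_GRP_NODE;
	default:
		return unk_reason;
	}
}

int main(void)
{
	/* A memory limit trips: the generic code becomes the MEM one. */
	printf("%d\n", toy_specialize(TOY_TRES_MEM, TOY_WAIT_GRP_UNK));
	return 0;
}
```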
diff --git a/slurm/slurm.h.in b/slurm/slurm.h.in
index eaa88700dcc..6befb6786e8 100644
--- a/slurm/slurm.h.in
+++ b/slurm/slurm.h.in
@@ -359,35 +359,36 @@ enum job_state_reason {
 	WAIT_DEP_INVALID,	/* Dependency condition invalid or never
 				 * satisfied */
-	WAIT_QOS_GRP_CPU,	/* QOS GrpCpus exceeded */
-	WAIT_QOS_GRP_CPU_MIN,	/* QOS GrpCPUMins exceeded */
-	WAIT_QOS_GRP_CPU_RUN_MIN, /* QOS GrpCPURunMins exceeded */
+	WAIT_QOS_GRP_CPU,	/* QOS GrpTRES exceeded (CPU) */
+	WAIT_QOS_GRP_CPU_MIN,	/* QOS GrpTRESMins exceeded (CPU) */
+	WAIT_QOS_GRP_CPU_RUN_MIN, /* QOS GrpTRESRunMins exceeded (CPU) */
 	WAIT_QOS_GRP_JOB,	/* QOS GrpJobs exceeded */
-	WAIT_QOS_GRP_MEMORY,	/* QOS GrpMemory exceeded */
-	WAIT_QOS_GRP_NODES,	/* QOS GrpNodes exceeded */
+	WAIT_QOS_GRP_MEM,	/* QOS GrpTRES exceeded (Memory) */
+	WAIT_QOS_GRP_NODE,	/* QOS GrpTRES exceeded (Node) */
 	WAIT_QOS_GRP_SUB_JOB,	/* QOS GrpSubmitJobs exceeded */
 	WAIT_QOS_GRP_WALL,	/* QOS GrpWall exceeded */
-	WAIT_QOS_MAX_CPUS_PER_JOB, /* QOS MaxCpusPerJob exceeded */
-	WAIT_QOS_MAX_CPU_MINS_PER_JOB,/* QOS MaxCpusMinsPerJob exceeded */
-	WAIT_QOS_MAX_NODE_PER_JOB, /* QOS MaxNodesPerJob exceeded */
+	WAIT_QOS_MAX_CPU_PER_JOB, /* QOS MaxTRESPerJob exceeded (CPU) */
+	WAIT_QOS_MAX_CPU_MINS_PER_JOB,/* QOS MaxTRESMinsPerJob exceeded (CPU) */
+	WAIT_QOS_MAX_NODE_PER_JOB, /* QOS MaxTRESPerJob exceeded (Node) */
 	WAIT_QOS_MAX_WALL_PER_JOB, /* QOS MaxWallDurationPerJob exceeded */
-	WAIT_QOS_MAX_CPU_PER_USER, /* QOS MaxCpusPerUser exceeded */
+	WAIT_QOS_MAX_CPU_PER_USER, /* QOS MaxTRESPerUser exceeded (CPU) */
 	WAIT_QOS_MAX_JOB_PER_USER, /* QOS MaxJobsPerUser exceeded */
-	WAIT_QOS_MAX_NODE_PER_USER, /* QOS MaxNodesPerUser exceeded */
+	WAIT_QOS_MAX_NODE_PER_USER, /* QOS MaxTRESPerUser exceeded (Node) */
 	WAIT_QOS_MAX_SUB_JOB,	/* QOS MaxSubmitJobsPerUser exceeded */
-	WAIT_QOS_MIN_CPUS,	/* QOS MinCPUsPerJob not reached */
-	WAIT_ASSOC_GRP_CPU,	/* ASSOC GrpCpus exceeded */
-	WAIT_ASSOC_GRP_CPU_MIN,	/* ASSOC GrpCPUMins exceeded */
-	WAIT_ASSOC_GRP_CPU_RUN_MIN, /* ASSOC GrpCPURunMins exceeded */
+	WAIT_QOS_MIN_CPU,	/* QOS MinTRESPerJob not reached (CPU) */
+	WAIT_ASSOC_GRP_CPU,	/* ASSOC GrpTRES exceeded (CPU) */
+	WAIT_ASSOC_GRP_CPU_MIN,	/* ASSOC GrpTRESMins exceeded (CPU) */
+	WAIT_ASSOC_GRP_CPU_RUN_MIN, /* ASSOC GrpTRESRunMins exceeded (CPU) */
 	WAIT_ASSOC_GRP_JOB,	/* ASSOC GrpJobs exceeded */
-	WAIT_ASSOC_GRP_MEMORY,	/* ASSOC GrpMemory exceeded */
-	WAIT_ASSOC_GRP_NODES,	/* ASSOC GrpNodes exceeded */
+	WAIT_ASSOC_GRP_MEM,	/* ASSOC GrpTRES exceeded (Memory) */
+	WAIT_ASSOC_GRP_NODE,	/* ASSOC GrpTRES exceeded (Node) */
 	WAIT_ASSOC_GRP_SUB_JOB,	/* ASSOC GrpSubmitJobs exceeded */
 	WAIT_ASSOC_GRP_WALL,	/* ASSOC GrpWall exceeded */
 	WAIT_ASSOC_MAX_JOBS,	/* ASSOC MaxJobs exceeded */
-	WAIT_ASSOC_MAX_CPUS_PER_JOB, /* ASSOC MaxCpusPerJob exceeded */
-	WAIT_ASSOC_MAX_CPU_MINS_PER_JOB,/* ASSOC MaxCpusMinsPerJob exceeded */
-	WAIT_ASSOC_MAX_NODE_PER_JOB, /* ASSOC MaxNodesPerJob exceeded */
+	WAIT_ASSOC_MAX_CPU_PER_JOB, /* ASSOC MaxTRESPerJob exceeded (CPU) */
+	WAIT_ASSOC_MAX_CPU_MINS_PER_JOB,/* ASSOC MaxTRESMinsPerJob
+					 * exceeded (CPU) */
+	WAIT_ASSOC_MAX_NODE_PER_JOB, /* ASSOC MaxTRESPerJob exceeded (NODE) */
 	WAIT_ASSOC_MAX_WALL_PER_JOB, /* ASSOC MaxWallDurationPerJob
 				      * exceeded */
 	WAIT_ASSOC_MAX_SUB_JOB,	/* ASSOC MaxSubmitJobsPerUser exceeded */
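The hunk above only renames existing codes in place. The next hunk appends every genuinely new code after WAIT_POWER_RESERVED rather than grouping it with its older counterpart, which keeps the numeric values of the existing enum members (and therefore the RPC wire format) stable. Because the additions land in contiguous per-category runs, a consumer could classify them with a simple range check; a hypothetical helper, not part of the patch, that assumes the enum layout shown below:

```c
#include <stdbool.h>
#include <slurm/slurm.h>	/* installed form of slurm.h.in */

/* Hypothetical: true for any of the new association-level TRES codes,
 * which run contiguously from WAIT_ASSOC_GRP_UNK through
 * WAIT_ASSOC_MAX_BB_MINS_PER_JOB in the layout below. */
static bool is_new_assoc_tres_reason(enum job_state_reason reason)
{
	return (reason >= WAIT_ASSOC_GRP_UNK) &&
	       (reason <= WAIT_ASSOC_MAX_BB_MINS_PER_JOB);
}
```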
@@ -400,6 +401,125 @@ enum job_state_reason {
 	WAIT_POWER_NOT_AVAIL,	/* not enough power available */
 	WAIT_POWER_RESERVED,	/* job is waiting for available power
 				 * because of power reservations */
+	WAIT_ASSOC_GRP_UNK,	/* ASSOC GrpTRES exceeded
+				 * (Unknown) */
+	WAIT_ASSOC_GRP_UNK_MIN,	/* ASSOC GrpTRESMins exceeded
+				 * (Unknown) */
+	WAIT_ASSOC_GRP_UNK_RUN_MIN, /* ASSOC GrpTRESRunMins exceeded
+				     * (Unknown) */
+	WAIT_ASSOC_MAX_UNK_PER_JOB, /* ASSOC MaxTRESPerJob exceeded
+				     * (Unknown) */
+	WAIT_ASSOC_MAX_UNK_MINS_PER_JOB,/* ASSOC MaxTRESMinsPerJob
+					 * exceeded (Unknown) */
+	WAIT_ASSOC_GRP_MEM_MIN,	/* ASSOC GrpTRESMins exceeded
+				 * (Memory) */
+	WAIT_ASSOC_GRP_MEM_RUN_MIN, /* ASSOC GrpTRESRunMins exceeded
+				     * (Memory) */
+	WAIT_ASSOC_MAX_MEM_PER_JOB, /* ASSOC MaxTRESPerJob exceeded (Memory) */
+	WAIT_ASSOC_MAX_MEM_MINS_PER_JOB,/* ASSOC MaxTRESMinsPerJob
+					 * exceeded (Memory) */
+	WAIT_ASSOC_GRP_NODE_MIN, /* ASSOC GrpTRESMins exceeded (Node) */
+	WAIT_ASSOC_GRP_NODE_RUN_MIN, /* ASSOC GrpTRESRunMins exceeded (Node) */
+	WAIT_ASSOC_MAX_NODE_MINS_PER_JOB,/* ASSOC MaxTRESMinsPerJob
+					  * exceeded (Node) */
+	WAIT_ASSOC_GRP_ENERGY,	/* ASSOC GrpTRES exceeded
+				 * (Energy) */
+	WAIT_ASSOC_GRP_ENERGY_MIN, /* ASSOC GrpTRESMins exceeded
+				    * (Energy) */
+	WAIT_ASSOC_GRP_ENERGY_RUN_MIN, /* ASSOC GrpTRESRunMins exceeded
+					* (Energy) */
+	WAIT_ASSOC_MAX_ENERGY_PER_JOB, /* ASSOC MaxTRESPerJob exceeded
+					* (Energy) */
+	WAIT_ASSOC_MAX_ENERGY_MINS_PER_JOB,/* ASSOC MaxTRESMinsPerJob
+					    * exceeded (Energy) */
+	WAIT_ASSOC_GRP_GRES,	/* ASSOC GrpTRES exceeded (GRES) */
+	WAIT_ASSOC_GRP_GRES_MIN, /* ASSOC GrpTRESMins exceeded (GRES) */
+	WAIT_ASSOC_GRP_GRES_RUN_MIN, /* ASSOC GrpTRESRunMins exceeded (GRES) */
+	WAIT_ASSOC_MAX_GRES_PER_JOB, /* ASSOC MaxTRESPerJob exceeded (GRES) */
+	WAIT_ASSOC_MAX_GRES_MINS_PER_JOB,/* ASSOC MaxTRESMinsPerJob
+					  * exceeded (GRES) */
+	WAIT_ASSOC_GRP_LIC,	/* ASSOC GrpTRES exceeded
+				 * (license) */
+	WAIT_ASSOC_GRP_LIC_MIN,	/* ASSOC GrpTRESMins exceeded
+				 * (license) */
+	WAIT_ASSOC_GRP_LIC_RUN_MIN, /* ASSOC GrpTRESRunMins exceeded
+				     * (license) */
+	WAIT_ASSOC_MAX_LIC_PER_JOB, /* ASSOC MaxTRESPerJob exceeded
+				     * (license) */
+	WAIT_ASSOC_MAX_LIC_MINS_PER_JOB,/* ASSOC MaxTRESMinsPerJob exceeded
+					 * (license) */
+	WAIT_ASSOC_GRP_BB,	/* ASSOC GrpTRES exceeded
+				 * (burst buffer) */
+	WAIT_ASSOC_GRP_BB_MIN,	/* ASSOC GrpTRESMins exceeded
+				 * (burst buffer) */
+	WAIT_ASSOC_GRP_BB_RUN_MIN, /* ASSOC GrpTRESRunMins exceeded
+				    * (burst buffer) */
+	WAIT_ASSOC_MAX_BB_PER_JOB, /* ASSOC MaxTRESPerJob exceeded
+				    * (burst buffer) */
+	WAIT_ASSOC_MAX_BB_MINS_PER_JOB,/* ASSOC MaxTRESMinsPerJob exceeded
+					* (burst buffer) */
+	WAIT_QOS_GRP_UNK,	/* QOS GrpTRES exceeded (Unknown) */
+	WAIT_QOS_GRP_UNK_MIN,	/* QOS GrpTRESMins exceeded (Unknown) */
+	WAIT_QOS_GRP_UNK_RUN_MIN, /* QOS GrpTRESRunMins exceeded (Unknown) */
+	WAIT_QOS_MAX_UNK_PER_JOB, /* QOS MaxTRESPerJob exceeded (Unknown) */
+	WAIT_QOS_MAX_UNK_PER_USER, /* QOS MaxTRESPerUser exceeded (Unknown) */
+	WAIT_QOS_MAX_UNK_MINS_PER_JOB,/* QOS MaxTRESMinsPerJob
+				       * exceeded (Unknown) */
+	WAIT_QOS_MIN_UNK,	/* QOS MinTRESPerJob not reached (Unknown) */
+	WAIT_QOS_GRP_MEM_MIN,	/* QOS GrpTRESMins exceeded
+				 * (Memory) */
+	WAIT_QOS_GRP_MEM_RUN_MIN, /* QOS GrpTRESRunMins exceeded
+				   * (Memory) */
+	WAIT_QOS_MAX_MEM_MINS_PER_JOB,/* QOS MaxTRESMinsPerJob
+				       * exceeded (Memory) */
+	WAIT_QOS_MAX_MEM_PER_JOB, /* QOS MaxTRESPerJob exceeded (Memory) */
+	WAIT_QOS_MAX_MEM_PER_USER, /* QOS MaxTRESPerUser exceeded (Memory) */
+	WAIT_QOS_MIN_MEM,	/* QOS MinTRESPerJob not reached (Memory) */
+	WAIT_QOS_GRP_ENERGY,	/* QOS GrpTRES exceeded (Energy) */
+	WAIT_QOS_GRP_ENERGY_MIN, /* QOS GrpTRESMins exceeded (Energy) */
+	WAIT_QOS_GRP_ENERGY_RUN_MIN, /* QOS GrpTRESRunMins exceeded (Energy) */
+	WAIT_QOS_MAX_ENERGY_PER_JOB, /* QOS MaxTRESPerJob exceeded (Energy) */
+	WAIT_QOS_MAX_ENERGY_PER_USER,/* QOS MaxTRESPerUser exceeded (Energy) */
+	WAIT_QOS_MAX_ENERGY_MINS_PER_JOB,/* QOS MaxTRESMinsPerJob
+					  * exceeded (Energy) */
+	WAIT_QOS_MIN_ENERGY,	/* QOS MinTRESPerJob not reached (Energy) */
+	WAIT_QOS_GRP_NODE_MIN,	/* QOS GrpTRESMins exceeded (Node) */
+	WAIT_QOS_GRP_NODE_RUN_MIN, /* QOS GrpTRESRunMins exceeded (Node) */
+	WAIT_QOS_MAX_NODE_MINS_PER_JOB, /* QOS MaxTRESMinsPerJob
+					 * exceeded (Node) */
+	WAIT_QOS_MIN_NODE,	/* QOS MinTRESPerJob not reached (Node) */
+	WAIT_QOS_GRP_GRES,	/* QOS GrpTRES exceeded (GRES) */
+	WAIT_QOS_GRP_GRES_MIN,	/* QOS GrpTRESMins exceeded (GRES) */
+	WAIT_QOS_GRP_GRES_RUN_MIN, /* QOS GrpTRESRunMins exceeded (GRES) */
+	WAIT_QOS_MAX_GRES_PER_JOB, /* QOS MaxTRESPerJob exceeded (GRES) */
+	WAIT_QOS_MAX_GRES_PER_USER, /* QOS MaxTRESPerUser exceeded
+				     * (GRES) */
+	WAIT_QOS_MAX_GRES_MINS_PER_JOB,/* QOS MaxTRESMinsPerJob
+					* exceeded (GRES) */
+	WAIT_QOS_MIN_GRES,	/* QOS MinTRESPerJob not reached (GRES) */
+	WAIT_QOS_GRP_LIC,	/* QOS GrpTRES exceeded (license) */
+	WAIT_QOS_GRP_LIC_MIN,	/* QOS GrpTRESMins exceeded (license) */
+	WAIT_QOS_GRP_LIC_RUN_MIN, /* QOS GrpTRESRunMins exceeded (license) */
+	WAIT_QOS_MAX_LIC_PER_JOB, /* QOS MaxTRESPerJob exceeded (license) */
+	WAIT_QOS_MAX_LIC_PER_USER, /* QOS MaxTRESPerUser exceeded (license) */
+	WAIT_QOS_MAX_LIC_MINS_PER_JOB,/* QOS MaxTRESMinsPerJob exceeded
+				       * (license) */
+	WAIT_QOS_MIN_LIC,	/* QOS MinTRESPerJob not reached
+				 * (license) */
+	WAIT_QOS_GRP_BB,	/* QOS GrpTRES exceeded
+				 * (burst buffer) */
+	WAIT_QOS_GRP_BB_MIN,	/* QOS GrpTRESMins exceeded
+				 * (burst buffer) */
+	WAIT_QOS_GRP_BB_RUN_MIN, /* QOS GrpTRESRunMins exceeded
+				  * (burst buffer) */
+	WAIT_QOS_MAX_BB_PER_JOB, /* QOS MaxTRESPerJob exceeded
+				  * (burst buffer) */
+	WAIT_QOS_MAX_BB_PER_USER, /* QOS MaxTRESPerUser exceeded
+				   * (burst buffer) */
+	WAIT_QOS_MAX_BB_MINS_PER_JOB,/* QOS MaxTRESMinsPerJob exceeded
+				      * (burst buffer) */
+	WAIT_QOS_MIN_BB,	/* QOS MinTRESPerJob not reached
+				 * (burst buffer) */
 };
 
 enum job_acct_types {
diff --git a/src/common/slurm_protocol_defs.c b/src/common/slurm_protocol_defs.c
index 420be2d2415..6567e0e9890 100644
--- a/src/common/slurm_protocol_defs.c
+++ b/src/common/slurm_protocol_defs.c
@@ -1335,32 +1335,32 @@ extern char *job_reason_string(enum job_state_reason inx)
 		return "QOSGrpCPURunMinsLimit";
 	case WAIT_QOS_GRP_JOB:
 		return"QOSGrpJobsLimit";
-	case WAIT_QOS_GRP_MEMORY:
-		return "QOSGrpMemoryLimit";
-	case WAIT_QOS_GRP_NODES:
-		return "QOSGrpNodesLimit";
+	case WAIT_QOS_GRP_MEM:
+		return "QOSGrpMemLimit";
+	case WAIT_QOS_GRP_NODE:
+		return "QOSGrpNodeLimit";
 	case WAIT_QOS_GRP_SUB_JOB:
 		return "QOSGrpSubmitJobsLimit";
 	case WAIT_QOS_GRP_WALL:
 		return "QOSGrpWallLimit";
-	case WAIT_QOS_MAX_CPUS_PER_JOB:
-		return "QOSMaxCpusPerJobLimit";
+	case WAIT_QOS_MAX_CPU_PER_JOB:
+		return "QOSMaxCpuPerJobLimit";
 	case WAIT_QOS_MAX_CPU_MINS_PER_JOB:
-		return "QOSMaxCpusMinsPerJobLimit";
+		return "QOSMaxCpuMinsPerJobLimit";
 	case WAIT_QOS_MAX_NODE_PER_JOB:
-		return "QOSMaxNodesPerJobLimit";
+		return "QOSMaxNodePerJobLimit";
 	case WAIT_QOS_MAX_WALL_PER_JOB:
 		return "QOSMaxWallDurationPerJobLimit";
 	case WAIT_QOS_MAX_CPU_PER_USER:
-		return "QOSMaxCpusPerUserLimit";
+		return "QOSMaxCpuPerUserLimit";
 	case WAIT_QOS_MAX_JOB_PER_USER:
 		return "QOSMaxJobsPerUserLimit";
 	case WAIT_QOS_MAX_NODE_PER_USER:
-		return "QOSMaxNodesPerUserLimit";
+		return "QOSMaxNodePerUserLimit";
 	case WAIT_QOS_MAX_SUB_JOB:
 		return "QOSMaxSubmitJobPerUserLimit";
-	case WAIT_QOS_MIN_CPUS:
-		return "QOSMinCPUsNotSatisfied";
+	case WAIT_QOS_MIN_CPU:
+		return "QOSMinCpuNotSatisfied";
 	case WAIT_ASSOC_GRP_CPU:
 		return "AssocGrpCpuLimit";
 	case WAIT_ASSOC_GRP_CPU_MIN:
@@ -1369,22 +1369,22 @@ extern char *job_reason_string(enum job_state_reason inx)
 		return "AssocGrpCPURunMinsLimit";
 	case WAIT_ASSOC_GRP_JOB:
 		return"AssocGrpJobsLimit";
-	case WAIT_ASSOC_GRP_MEMORY:
-		return "AssocGrpMemoryLimit";
-	case WAIT_ASSOC_GRP_NODES:
-		return "AssocGrpNodesLimit";
+	case WAIT_ASSOC_GRP_MEM:
+		return "AssocGrpMemLimit";
+	case WAIT_ASSOC_GRP_NODE:
+		return "AssocGrpNodeLimit";
 	case WAIT_ASSOC_GRP_SUB_JOB:
 		return "AssocGrpSubmitJobsLimit";
 	case WAIT_ASSOC_GRP_WALL:
 		return "AssocGrpWallLimit";
 	case WAIT_ASSOC_MAX_JOBS:
 		return "AssocMaxJobsLimit";
-	case WAIT_ASSOC_MAX_CPUS_PER_JOB:
-		return "AssocMaxCpusPerJobLimit";
+	case WAIT_ASSOC_MAX_CPU_PER_JOB:
+		return "AssocMaxCpuPerJobLimit";
 	case WAIT_ASSOC_MAX_CPU_MINS_PER_JOB:
-		return "AssocMaxCpusMinsPerJobLimit";
+		return "AssocMaxCpuMinsPerJobLimit";
 	case WAIT_ASSOC_MAX_NODE_PER_JOB:
-		return "AssocMaxNodesPerJobLimit";
+		return "AssocMaxNodePerJobLimit";
 	case WAIT_ASSOC_MAX_WALL_PER_JOB:
 		return "AssocMaxWallDurationPerJobLimit";
 	case WAIT_ASSOC_MAX_SUB_JOB:
@@ -1403,6 +1403,161 @@ extern char *job_reason_string(enum job_state_reason inx)
 		return "PowerNotAvail";
 	case WAIT_POWER_RESERVED:
 		return "PowerReserved";
+	case WAIT_ASSOC_GRP_UNK:
+		return "AssocGrpUnknown";
+	case WAIT_ASSOC_GRP_UNK_MIN:
+		return "AssocGrpUnknownMin";
+	case WAIT_ASSOC_GRP_UNK_RUN_MIN:
+		return "AssocGrpUnknownRunMin";
+	case WAIT_ASSOC_MAX_UNK_PER_JOB:
+		return "AssocMaxUnknownPerJob";
+	case WAIT_ASSOC_MAX_UNK_MINS_PER_JOB:
+		return "AssocMaxUnknownMinPerJob";
+	case WAIT_ASSOC_GRP_MEM_MIN:
+		return "AssocGrpMemMin";
+	case WAIT_ASSOC_GRP_MEM_RUN_MIN:
+		return "AssocGrpMemRunMin";
+	case WAIT_ASSOC_MAX_MEM_PER_JOB:
+		return "AssocMaxMemPerJob";
+	case WAIT_ASSOC_MAX_MEM_MINS_PER_JOB:
+		return "AssocMaxMemMinPerJob";
+	case WAIT_ASSOC_GRP_NODE_MIN:
+		return "AssocGrpNodeMin";
+	case WAIT_ASSOC_GRP_NODE_RUN_MIN:
+		return "AssocGrpNodeRunMin";
+	case WAIT_ASSOC_MAX_NODE_MINS_PER_JOB:
+		return "AssocMaxNodeMinPerJob";
+	case WAIT_ASSOC_GRP_ENERGY:
+		return "AssocGrpEnergy";
+	case WAIT_ASSOC_GRP_ENERGY_MIN:
+		return "AssocGrpEnergyMin";
+	case WAIT_ASSOC_GRP_ENERGY_RUN_MIN:
+		return "AssocGrpEnergyRunMin";
+	case WAIT_ASSOC_MAX_ENERGY_PER_JOB:
+		return "AssocMaxEnergyPerJob";
+	case WAIT_ASSOC_MAX_ENERGY_MINS_PER_JOB:
+		return "AssocMaxEnergyMinPerJob";
+	case WAIT_ASSOC_GRP_GRES:
+		return "AssocGrpGRES";
+	case WAIT_ASSOC_GRP_GRES_MIN:
+		return "AssocGrpGRESMin";
+	case WAIT_ASSOC_GRP_GRES_RUN_MIN:
+		return "AssocGrpGRESRunMin";
+	case WAIT_ASSOC_MAX_GRES_PER_JOB:
+		return "AssocMaxGRESPerJob";
+	case WAIT_ASSOC_MAX_GRES_MINS_PER_JOB:
+		return "AssocMaxGRESMinPerJob";
+	case WAIT_ASSOC_GRP_LIC:
+		return "AssocGrpLicense";
+	case WAIT_ASSOC_GRP_LIC_MIN:
+		return "AssocGrpLicenseMin";
+	case WAIT_ASSOC_GRP_LIC_RUN_MIN:
+		return "AssocGrpLicenseRunMin";
+	case WAIT_ASSOC_MAX_LIC_PER_JOB:
+		return "AssocMaxLicensePerJob";
+	case WAIT_ASSOC_MAX_LIC_MINS_PER_JOB:
+		return "AssocMaxLicenseMinPerJob";
+	case WAIT_ASSOC_GRP_BB:
+		return "AssocGrpBB";
+	case WAIT_ASSOC_GRP_BB_MIN:
+		return "AssocGrpBBMin";
+	case WAIT_ASSOC_GRP_BB_RUN_MIN:
+		return "AssocGrpBBRunMin";
+	case WAIT_ASSOC_MAX_BB_PER_JOB:
+		return "AssocMaxBBPerJob";
+	case WAIT_ASSOC_MAX_BB_MINS_PER_JOB:
+		return "AssocMaxBBMinPerJob";
+
+	case WAIT_QOS_GRP_UNK:
+		return "QOSGrpUnknown";
+	case WAIT_QOS_GRP_UNK_MIN:
+		return "QOSGrpUnknownMin";
+	case WAIT_QOS_GRP_UNK_RUN_MIN:
+		return "QOSGrpUnknownRunMin";
+	case WAIT_QOS_MAX_UNK_PER_JOB:
+		return "QOSMaxUnknownPerJob";
+	case WAIT_QOS_MAX_UNK_PER_USER:
+		return "QOSMaxUnknownPerUser";
+	case WAIT_QOS_MAX_UNK_MINS_PER_JOB:
+		return "QOSMaxUnknownMinPerJob";
+	case WAIT_QOS_MIN_UNK:
+		return "QOSMinUnknown";
+	case WAIT_QOS_GRP_MEM_MIN:
+		return "QOSGrpMemoryMin";
+	case WAIT_QOS_GRP_MEM_RUN_MIN:
+		return "QOSGrpMemoryRunMin";
+	case WAIT_QOS_MAX_MEM_PER_JOB:
+		return "QOSMaxMemoryPerJob";
+	case WAIT_QOS_MAX_MEM_PER_USER:
+		return "QOSMaxMemoryPerUser";
+	case WAIT_QOS_MAX_MEM_MINS_PER_JOB:
+		return "QOSMaxMemoryMinPerJob";
+	case WAIT_QOS_MIN_MEM:
+		return "QOSMinMemory";
+	case WAIT_QOS_GRP_NODE_MIN:
+		return "QOSGrpNodeMin";
+	case WAIT_QOS_GRP_NODE_RUN_MIN:
+		return "QOSGrpNodeRunMin";
+	case WAIT_QOS_MAX_NODE_MINS_PER_JOB:
+		return "QOSMaxNodeMinPerJob";
+	case WAIT_QOS_MIN_NODE:
+		return "QOSMinNode";
+	case WAIT_QOS_GRP_ENERGY:
+		return "QOSGrpEnergy";
+	case WAIT_QOS_GRP_ENERGY_MIN:
+		return "QOSGrpEnergyMin";
+	case WAIT_QOS_GRP_ENERGY_RUN_MIN:
+		return "QOSGrpEnergyRunMin";
+	case WAIT_QOS_MAX_ENERGY_PER_JOB:
+		return "QOSMaxEnergyPerJob";
+	case WAIT_QOS_MAX_ENERGY_PER_USER:
+		return "QOSMaxEnergyPerUser";
+	case WAIT_QOS_MAX_ENERGY_MINS_PER_JOB:
+		return "QOSMaxEnergyMinPerJob";
+	case WAIT_QOS_MIN_ENERGY:
+		return "QOSMinEnergy";
+	case WAIT_QOS_GRP_GRES:
+		return "QOSGrpGRES";
+	case WAIT_QOS_GRP_GRES_MIN:
+		return "QOSGrpGRESMin";
+	case WAIT_QOS_GRP_GRES_RUN_MIN:
+		return "QOSGrpGRESRunMin";
+	case WAIT_QOS_MAX_GRES_PER_JOB:
+		return "QOSMaxGRESPerJob";
+	case WAIT_QOS_MAX_GRES_PER_USER:
+		return "QOSMaxGRESPerUser";
+	case WAIT_QOS_MAX_GRES_MINS_PER_JOB:
+		return "QOSMaxGRESMinPerJob";
+	case WAIT_QOS_MIN_GRES:
+		return "QOSMinGRES";
+	case WAIT_QOS_GRP_LIC:
+		return "QOSGrpLicense";
+	case WAIT_QOS_GRP_LIC_MIN:
+		return "QOSGrpLicenseMin";
+	case WAIT_QOS_GRP_LIC_RUN_MIN:
+		return "QOSGrpLicenseRunMin";
+	case WAIT_QOS_MAX_LIC_PER_JOB:
+		return "QOSMaxLicensePerJob";
+	case WAIT_QOS_MAX_LIC_PER_USER:
+		return "QOSMaxLicensePerUser";
+	case WAIT_QOS_MAX_LIC_MINS_PER_JOB:
+		return "QOSMaxLicenseMinPerJob";
+	case WAIT_QOS_MIN_LIC:
+		return "QOSMinLicense";
+	case WAIT_QOS_GRP_BB:
+		return "QOSGrpBB";
+	case WAIT_QOS_GRP_BB_MIN:
+		return "QOSGrpBBMin";
+	case WAIT_QOS_GRP_BB_RUN_MIN:
+		return "QOSGrpBBRunMin";
+	case WAIT_QOS_MAX_BB_PER_JOB:
+		return "QOSMaxBBPerJob";
+	case WAIT_QOS_MAX_BB_PER_USER:
+		return "QOSMaxBBPerUser";
+	case WAIT_QOS_MAX_BB_MINS_PER_JOB:
+		return "QOSMaxBBMinPerJob";
+	case WAIT_QOS_MIN_BB:
+		return "QOSMinBB";
 	default:
 		snprintf(val, sizeof(val), "%d", inx);
 		return val;
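The last file is where the new codes are actually selected. It opens by adding get_tres_state_reason(), which maps a generic *_UNK code plus a TRES index to the TRES-specific code; every limit check in the file then collapses to the same shape. Condensed from the hunks that follow (tres_pos and reason carry the same meaning as in the patched code):

```c
/* Condensed shape of the call sites in the hunks below: report the
 * generic per-user limit code, specialized to the failing TRES. */
if (reason)
	*reason = get_tres_state_reason(tres_pos, WAIT_QOS_MAX_UNK_PER_USER);
```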
diff --git a/src/slurmctld/acct_policy.c b/src/slurmctld/acct_policy.c
index 2ae0e075d5a..1eee69a091c 100644
--- a/src/slurmctld/acct_policy.c
+++ b/src/slurmctld/acct_policy.c
@@ -59,6 +59,231 @@ enum {
 	ACCT_POLICY_JOB_FINI
 };
 
+static int get_tres_state_reason(int tres_pos, int unk_reason)
+{
+	switch (tres_pos) {
+	case TRES_ARRAY_CPU:
+		switch (unk_reason) {
+		case WAIT_ASSOC_GRP_UNK:
+			return WAIT_ASSOC_GRP_CPU;
+		case WAIT_ASSOC_GRP_UNK_MIN:
+			return WAIT_ASSOC_GRP_CPU_MIN;
+		case WAIT_ASSOC_GRP_UNK_RUN_MIN:
+			return WAIT_ASSOC_GRP_CPU_RUN_MIN;
+		case WAIT_ASSOC_MAX_UNK_PER_JOB:
+			return WAIT_ASSOC_MAX_CPU_PER_JOB;
+		case WAIT_ASSOC_MAX_UNK_MINS_PER_JOB:
+			return WAIT_ASSOC_MAX_CPU_MINS_PER_JOB;
+		case WAIT_QOS_GRP_UNK:
+			return WAIT_QOS_GRP_CPU;
+		case WAIT_QOS_GRP_UNK_MIN:
+			return WAIT_QOS_GRP_CPU_MIN;
+		case WAIT_QOS_GRP_UNK_RUN_MIN:
+			return WAIT_QOS_GRP_CPU_RUN_MIN;
+		case WAIT_QOS_MAX_UNK_PER_JOB:
+			return WAIT_QOS_MAX_CPU_PER_JOB;
+		case WAIT_QOS_MAX_UNK_PER_USER:
+			return WAIT_QOS_MAX_CPU_PER_USER;
+		case WAIT_QOS_MAX_UNK_MINS_PER_JOB:
+			return WAIT_QOS_MAX_CPU_MINS_PER_JOB;
+		case WAIT_QOS_MIN_UNK:
+			return WAIT_QOS_MIN_CPU;
+		default:
+			return unk_reason;
+			break;
+		}
+		break;
+	case TRES_ARRAY_MEM:
+		switch (unk_reason) {
+		case WAIT_ASSOC_GRP_UNK:
+			return WAIT_ASSOC_GRP_MEM;
+		case WAIT_ASSOC_GRP_UNK_MIN:
+			return WAIT_ASSOC_GRP_MEM_MIN;
+		case WAIT_ASSOC_GRP_UNK_RUN_MIN:
+			return WAIT_ASSOC_GRP_MEM_RUN_MIN;
+		case WAIT_ASSOC_MAX_UNK_PER_JOB:
+			return WAIT_ASSOC_MAX_MEM_PER_JOB;
+		case WAIT_ASSOC_MAX_UNK_MINS_PER_JOB:
+			return WAIT_ASSOC_MAX_MEM_MINS_PER_JOB;
+		case WAIT_QOS_GRP_UNK:
+			return WAIT_QOS_GRP_MEM;
+		case WAIT_QOS_GRP_UNK_MIN:
+			return WAIT_QOS_GRP_MEM_MIN;
+		case WAIT_QOS_GRP_UNK_RUN_MIN:
+			return WAIT_QOS_GRP_MEM_RUN_MIN;
+		case WAIT_QOS_MAX_UNK_PER_JOB:
+			return WAIT_QOS_MAX_MEM_PER_JOB;
+		case WAIT_QOS_MAX_UNK_PER_USER:
+			return WAIT_QOS_MAX_MEM_PER_USER;
+		case WAIT_QOS_MAX_UNK_MINS_PER_JOB:
+			return WAIT_QOS_MAX_MEM_MINS_PER_JOB;
+		case WAIT_QOS_MIN_UNK:
+			return WAIT_QOS_MIN_MEM;
+		default:
+			return unk_reason;
+			break;
+		}
+		break;
+	case TRES_ARRAY_ENEGRY:
+		switch (unk_reason) {
+		case WAIT_ASSOC_GRP_UNK:
+			return WAIT_ASSOC_GRP_ENERGY;
+		case WAIT_ASSOC_GRP_UNK_MIN:
+			return WAIT_ASSOC_GRP_ENERGY_MIN;
+		case WAIT_ASSOC_GRP_UNK_RUN_MIN:
+			return WAIT_ASSOC_GRP_ENERGY_RUN_MIN;
+		case WAIT_ASSOC_MAX_UNK_PER_JOB:
+			return WAIT_ASSOC_MAX_ENERGY_PER_JOB;
+		case WAIT_ASSOC_MAX_UNK_MINS_PER_JOB:
+			return WAIT_ASSOC_MAX_ENERGY_MINS_PER_JOB;
+		case WAIT_QOS_GRP_UNK:
+			return WAIT_QOS_GRP_ENERGY;
+		case WAIT_QOS_GRP_UNK_MIN:
+			return WAIT_QOS_GRP_ENERGY_MIN;
+		case WAIT_QOS_GRP_UNK_RUN_MIN:
+			return WAIT_QOS_GRP_ENERGY_RUN_MIN;
+		case WAIT_QOS_MAX_UNK_PER_JOB:
+			return WAIT_QOS_MAX_ENERGY_PER_JOB;
+		case WAIT_QOS_MAX_UNK_PER_USER:
+			return WAIT_QOS_MAX_ENERGY_PER_USER;
+		case WAIT_QOS_MAX_UNK_MINS_PER_JOB:
+			return WAIT_QOS_MAX_ENERGY_MINS_PER_JOB;
+		case WAIT_QOS_MIN_UNK:
+			return WAIT_QOS_MIN_ENERGY;
+		default:
+			return unk_reason;
+			break;
+		}
+		break;
+	case TRES_ARRAY_NODE:
+		switch (unk_reason) {
+		case WAIT_ASSOC_GRP_UNK:
+			return WAIT_ASSOC_GRP_NODE;
+		case WAIT_ASSOC_GRP_UNK_MIN:
+			return WAIT_ASSOC_GRP_NODE_MIN;
+		case WAIT_ASSOC_GRP_UNK_RUN_MIN:
+			return WAIT_ASSOC_GRP_NODE_RUN_MIN;
+		case WAIT_ASSOC_MAX_UNK_PER_JOB:
+			return WAIT_ASSOC_MAX_NODE_PER_JOB;
+		case WAIT_ASSOC_MAX_UNK_MINS_PER_JOB:
+			return WAIT_ASSOC_MAX_NODE_MINS_PER_JOB;
+		case WAIT_QOS_GRP_UNK:
+			return WAIT_QOS_GRP_NODE;
+		case WAIT_QOS_GRP_UNK_MIN:
+			return WAIT_QOS_GRP_NODE_MIN;
+		case WAIT_QOS_GRP_UNK_RUN_MIN:
+			return WAIT_QOS_GRP_NODE_RUN_MIN;
+		case WAIT_QOS_MAX_UNK_PER_JOB:
+			return WAIT_QOS_MAX_NODE_PER_JOB;
+		case WAIT_QOS_MAX_UNK_PER_USER:
+			return WAIT_QOS_MAX_NODE_PER_USER;
+		case WAIT_QOS_MAX_UNK_MINS_PER_JOB:
+			return WAIT_QOS_MAX_NODE_MINS_PER_JOB;
+		case WAIT_QOS_MIN_UNK:
+			return WAIT_QOS_MIN_NODE;
+		default:
+			return unk_reason;
+			break;
+		}
+		break;
+	default:
+		if (!xstrcmp("gres", assoc_mgr_tres_array[tres_pos]->type))
+			switch (unk_reason) {
+			case WAIT_ASSOC_GRP_UNK:
+				return WAIT_ASSOC_GRP_GRES;
+			case WAIT_ASSOC_GRP_UNK_MIN:
+				return WAIT_ASSOC_GRP_GRES_MIN;
+			case WAIT_ASSOC_GRP_UNK_RUN_MIN:
+				return WAIT_ASSOC_GRP_GRES_RUN_MIN;
+			case WAIT_ASSOC_MAX_UNK_PER_JOB:
+				return WAIT_ASSOC_MAX_GRES_PER_JOB;
+			case WAIT_ASSOC_MAX_UNK_MINS_PER_JOB:
+				return WAIT_ASSOC_MAX_GRES_MINS_PER_JOB;
+			case WAIT_QOS_GRP_UNK:
+				return WAIT_QOS_GRP_GRES;
+			case WAIT_QOS_GRP_UNK_MIN:
+				return WAIT_QOS_GRP_GRES_MIN;
+			case WAIT_QOS_GRP_UNK_RUN_MIN:
+				return WAIT_QOS_GRP_GRES_RUN_MIN;
+			case WAIT_QOS_MAX_UNK_PER_JOB:
+				return WAIT_QOS_MAX_GRES_PER_JOB;
+			case WAIT_QOS_MAX_UNK_PER_USER:
+				return WAIT_QOS_MAX_GRES_PER_USER;
+			case WAIT_QOS_MAX_UNK_MINS_PER_JOB:
+				return WAIT_QOS_MAX_GRES_MINS_PER_JOB;
+			case WAIT_QOS_MIN_UNK:
+				return WAIT_QOS_MIN_GRES;
+			default:
+				return unk_reason;
+				break;
+			}
+		else if (!xstrcmp("license",
+				  assoc_mgr_tres_array[tres_pos]->type))
+			switch (unk_reason) {
+			case WAIT_ASSOC_GRP_UNK:
+				return WAIT_ASSOC_GRP_LIC;
+			case WAIT_ASSOC_GRP_UNK_MIN:
+				return WAIT_ASSOC_GRP_LIC_MIN;
+			case WAIT_ASSOC_GRP_UNK_RUN_MIN:
+				return WAIT_ASSOC_GRP_LIC_RUN_MIN;
+			case WAIT_ASSOC_MAX_UNK_PER_JOB:
+				return WAIT_ASSOC_MAX_LIC_PER_JOB;
+			case WAIT_ASSOC_MAX_UNK_MINS_PER_JOB:
+				return WAIT_ASSOC_MAX_LIC_MINS_PER_JOB;
+			case WAIT_QOS_GRP_UNK:
+				return WAIT_QOS_GRP_LIC;
+			case WAIT_QOS_GRP_UNK_MIN:
+				return WAIT_QOS_GRP_LIC_MIN;
+			case WAIT_QOS_GRP_UNK_RUN_MIN:
+				return WAIT_QOS_GRP_LIC_RUN_MIN;
+			case WAIT_QOS_MAX_UNK_PER_JOB:
+				return WAIT_QOS_MAX_LIC_PER_JOB;
+			case WAIT_QOS_MAX_UNK_PER_USER:
+				return WAIT_QOS_MAX_LIC_PER_USER;
+			case WAIT_QOS_MAX_UNK_MINS_PER_JOB:
+				return WAIT_QOS_MAX_LIC_MINS_PER_JOB;
+			case WAIT_QOS_MIN_UNK:
+				return WAIT_QOS_MIN_LIC;
+			default:
+				return unk_reason;
+				break;
+			}
+		else if (!xstrcmp("bb", assoc_mgr_tres_array[tres_pos]->type))
+			switch (unk_reason) {
+			case WAIT_ASSOC_GRP_UNK:
+				return WAIT_ASSOC_GRP_BB;
+			case WAIT_ASSOC_GRP_UNK_MIN:
+				return WAIT_ASSOC_GRP_BB_MIN;
+			case WAIT_ASSOC_GRP_UNK_RUN_MIN:
+				return WAIT_ASSOC_GRP_BB_RUN_MIN;
+			case WAIT_ASSOC_MAX_UNK_PER_JOB:
+				return WAIT_ASSOC_MAX_BB_PER_JOB;
+			case WAIT_ASSOC_MAX_UNK_MINS_PER_JOB:
+				return WAIT_ASSOC_MAX_BB_MINS_PER_JOB;
+			case WAIT_QOS_GRP_UNK:
+				return WAIT_QOS_GRP_BB;
+			case WAIT_QOS_GRP_UNK_MIN:
+				return WAIT_QOS_GRP_BB_MIN;
+			case WAIT_QOS_GRP_UNK_RUN_MIN:
+				return WAIT_QOS_GRP_BB_RUN_MIN;
+			case WAIT_QOS_MAX_UNK_PER_JOB:
+				return WAIT_QOS_MAX_BB_PER_JOB;
+			case WAIT_QOS_MAX_UNK_PER_USER:
+				return WAIT_QOS_MAX_BB_PER_USER;
+			case WAIT_QOS_MAX_UNK_MINS_PER_JOB:
+				return WAIT_QOS_MAX_BB_MINS_PER_JOB;
+			case WAIT_QOS_MIN_UNK:
+				return WAIT_QOS_MIN_BB;
+			default:
+				return unk_reason;
+				break;
+			}
+		break;
+	}
+
+	return unk_reason;
+}
+
 static void _set_qos_order(struct job_record *job_ptr,
 			   slurmdb_qos_rec_t **qos_ptr_1,
 			   slurmdb_qos_rec_t **qos_ptr_2)
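With the helper in place, its behavior for two representative inputs follows directly from the switch above (the wrapper function is illustrative only and assumes the acct_policy.c context of this hunk):

```c
/* Illustration only -- not part of the patch. */
static void tres_reason_examples(void)
{
	/* Memory member of a QOS GrpTRES limit tripped: */
	int r1 = get_tres_state_reason(TRES_ARRAY_MEM, WAIT_QOS_GRP_UNK);
	/* r1 == WAIT_QOS_GRP_MEM, rendered as "QOSGrpMemLimit" */

	/* CPU member of an association MaxTRESPerJob limit tripped: */
	int r2 = get_tres_state_reason(TRES_ARRAY_CPU,
				       WAIT_ASSOC_MAX_UNK_PER_JOB);
	/* r2 == WAIT_ASSOC_MAX_CPU_PER_JOB -> "AssocMaxCpuPerJobLimit" */

	(void)r1;
	(void)r2;
}
```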
@@ -745,7 +970,8 @@ static int _qos_policy_validate(job_desc_msg_t *job_desc,
 		if (job_desc->tres_req_cnt[tres_pos] >
 		    qos_ptr->max_tres_pu_ctld[tres_pos]) {
 			if (reason)
-				*reason = WAIT_QOS_MAX_CPU_PER_USER;
+				*reason = get_tres_state_reason(
+					tres_pos, WAIT_QOS_MAX_UNK_PER_USER);
 
 			debug2("job submit for user %s(%u): "
 			       "min tres(%s%s%s) request %"PRIu64" exceeds "
@@ -764,7 +990,8 @@ static int _qos_policy_validate(job_desc_msg_t *job_desc,
 		} else if (job_desc->tres_req_cnt[tres_pos] >
 			   qos_ptr->grp_tres_ctld[tres_pos]) {
 			if (reason)
-				*reason = WAIT_QOS_GRP_CPU;
+				*reason = get_tres_state_reason(
+					tres_pos, WAIT_QOS_GRP_UNK);
 
 			debug2("job submit for user %s(%u): "
 			       "min tres(%s%s%s) request %"PRIu64" exceeds "
@@ -825,6 +1052,9 @@ static int _qos_policy_validate(job_desc_msg_t *job_desc,
 			    qos_out_ptr->max_tres_mins_pj_ctld,
 			    &acct_policy_limit_set->time,
 			    strict_checking)) {
+			if (reason)
+				*reason = get_tres_state_reason(
+					tres_pos, WAIT_QOS_MAX_UNK_PER_JOB);
 			debug2("job submit for user %s(%u): "
 			       "tres(%s%s%s) time limit request %"PRIu64" "
 			       "exceeds max per-job limit %"PRIu64" "
@@ -839,7 +1069,8 @@ static int _qos_policy_validate(job_desc_msg_t *job_desc,
 				job_desc->tres_req_cnt[tres_pos]),
 			       qos_ptr->max_tres_mins_pj_ctld[tres_pos],
 			       qos_ptr->name);
-
+			rc = false;
+			goto end_it;
 		}
 
 		if ((qos_out_ptr->max_wall_pj == INFINITE) &&
@@ -897,7 +1128,8 @@ static int _qos_policy_validate(job_desc_msg_t *job_desc,
 				   acct_policy_limit_set->tres,
 				   strict_checking, 1)) {
 			if (reason)
-				*reason = WAIT_QOS_MAX_CPUS_PER_JOB;
+				*reason = get_tres_state_reason(
+					tres_pos, WAIT_QOS_MAX_UNK_PER_JOB);
 
 			debug2("job submit for user %s(%u): "
 			       "min tres(%s%s%s) request %"PRIu64" exceeds "
@@ -957,7 +1189,8 @@ static int _qos_policy_validate(job_desc_msg_t *job_desc,
 				   acct_policy_limit_set->tres,
 				   strict_checking, 0)) {
 			if (reason)
-				*reason = WAIT_QOS_MIN_CPUS;
+				*reason = get_tres_state_reason(
+					tres_pos, WAIT_QOS_MIN_UNK);
 
 			debug2("job submit for user %s(%u): "
 			       "min tres(%s%s%s) request %"PRIu64" exceeds "
@@ -1170,7 +1403,8 @@ static int _qos_job_runnable_post_select(struct job_record *job_ptr,
 		switch (i) {
 		case 1:
 			xfree(job_ptr->state_desc);
-			job_ptr->state_reason = WAIT_QOS_GRP_CPU_MIN;
+			job_ptr->state_reason = get_tres_state_reason(
+				tres_pos, WAIT_QOS_GRP_UNK_MIN);
 			debug2("Job %u being held, "
 			       "QOS %s group max tres(%s%s%s) minutes limit "
 			       "of %"PRIu64" is already at or exceeded with %"PRIu64,
@@ -1187,7 +1421,8 @@ static int _qos_job_runnable_post_select(struct job_record *job_ptr,
 			break;
 		case 2:
 			xfree(job_ptr->state_desc);
-			job_ptr->state_reason = WAIT_QOS_GRP_CPU_MIN;
+			job_ptr->state_reason = get_tres_state_reason(
+				tres_pos, WAIT_QOS_GRP_UNK_MIN);
 			debug2("Job %u being held, "
 			       "the job is requesting more than allowed with QOS %s's "
 			       "group max tres(%s%s%s) minutes of %"PRIu64" "
@@ -1212,7 +1447,8 @@ static int _qos_job_runnable_post_select(struct job_record *job_ptr,
 			 * being killed */
 			xfree(job_ptr->state_desc);
-			job_ptr->state_reason = WAIT_QOS_GRP_CPU_MIN;
+			job_ptr->state_reason = get_tres_state_reason(
+				tres_pos, WAIT_QOS_GRP_UNK_MIN);
 			debug2("Job %u being held, "
 			       "the job is at or exceeds QOS %s's "
 			       "group max tres(%s%s%s) minutes of %"PRIu64" "
@@ -1255,7 +1491,8 @@ static int _qos_job_runnable_post_select(struct job_record *job_ptr,
 			break;
 		case 2:
 			xfree(job_ptr->state_desc);
-			job_ptr->state_reason = WAIT_QOS_GRP_CPU;
+			job_ptr->state_reason = get_tres_state_reason(
+				tres_pos, WAIT_QOS_GRP_UNK);
 			debug2("job %u is being held, "
 			       "QOS %s min tres(%s%s%s) request %"PRIu64" exceeds "
 			       "group max tres limit %"PRIu64,
@@ -1272,7 +1509,8 @@ static int _qos_job_runnable_post_select(struct job_record *job_ptr,
 			break;
 		case 3:
 			xfree(job_ptr->state_desc);
-			job_ptr->state_reason = WAIT_QOS_GRP_CPU;
+			job_ptr->state_reason = get_tres_state_reason(
+				tres_pos, WAIT_QOS_GRP_UNK);
 			debug2("job %u being held, "
 			       "if allowed the job request will exceed "
 			       "QOS %s group max tres(%s%s%s) limit "
@@ -1307,7 +1545,8 @@ static int _qos_job_runnable_post_select(struct job_record *job_ptr,
 			break;
 		case 2:
 			xfree(job_ptr->state_desc);
-			job_ptr->state_reason = WAIT_QOS_GRP_CPU_RUN_MIN;
+			job_ptr->state_reason = get_tres_state_reason(
+				tres_pos, WAIT_QOS_GRP_UNK_RUN_MIN);
 			debug2("job %u is being held, "
 			       "QOS %s group max running tres(%s%s%s) minutes "
 			       "limit %"PRIu64" is already full with %"PRIu64,
@@ -1324,7 +1563,8 @@ static int _qos_job_runnable_post_select(struct job_record *job_ptr,
 			break;
 		case 3:
 			xfree(job_ptr->state_desc);
-			job_ptr->state_reason = WAIT_QOS_GRP_CPU_RUN_MIN;
+			job_ptr->state_reason = get_tres_state_reason(
+				tres_pos, WAIT_QOS_GRP_UNK_RUN_MIN);
 			debug2("job %u being held, "
 			       "if allowed the job request will exceed "
 			       "QOS %s group max running tres(%s%s%s) minutes "
@@ -1359,7 +1599,8 @@ static int _qos_job_runnable_post_select(struct job_record *job_ptr,
 					       job_ptr->limit_set.tres,
 					       1, 1)) {
 		xfree(job_ptr->state_desc);
-		job_ptr->state_reason = WAIT_QOS_MAX_CPU_MINS_PER_JOB;
+		job_ptr->state_reason = get_tres_state_reason(
+			tres_pos, WAIT_QOS_MAX_UNK_MINS_PER_JOB);
 		debug2("Job %u being held, "
 		       "the job is requesting more than allowed with QOS %s's "
 		       "max tres(%s%s%s) minutes of %"PRIu64" "
@@ -1385,7 +1626,8 @@ static int _qos_job_runnable_post_select(struct job_record *job_ptr,
 					       job_ptr->limit_set.tres,
 					       1, 1)) {
 		xfree(job_ptr->state_desc);
-		job_ptr->state_reason = WAIT_QOS_MAX_CPUS_PER_JOB;
+		job_ptr->state_reason = get_tres_state_reason(
+			tres_pos, WAIT_QOS_MAX_UNK_PER_JOB);
 		debug2("job %u is being held, "
 		       "QOS %s min tres(%s%s%s) per job "
 		       "request %"PRIu64" exceeds "
@@ -1411,7 +1653,8 @@ static int _qos_job_runnable_post_select(struct job_record *job_ptr,
 					       job_ptr->limit_set.tres,
 					       1, 0)) {
 		xfree(job_ptr->state_desc);
-		job_ptr->state_reason = WAIT_QOS_MIN_CPUS;
+		job_ptr->state_reason = get_tres_state_reason(
+			tres_pos, WAIT_QOS_MIN_UNK);
 		debug2("job %u is being held, "
 		       "QOS %s min tres(%s%s%s) per job "
 		       "request %"PRIu64" exceeds "
@@ -1442,7 +1685,8 @@ static int _qos_job_runnable_post_select(struct job_record *job_ptr,
 			 * TRES limit for the given QOS
 			 */
 			xfree(job_ptr->state_desc);
-			job_ptr->state_reason = WAIT_QOS_MAX_CPU_PER_USER;
+			job_ptr->state_reason = get_tres_state_reason(
+				tres_pos, WAIT_QOS_MAX_UNK_PER_USER);
 			debug2("job %u is being held, "
 			       "QOS %s min tres(%s%s%s) "
 			       "request %"PRIu64" exceeds "
@@ -1463,7 +1707,8 @@ static int _qos_job_runnable_post_select(struct job_record *job_ptr,
 			 * the QOS per-user TRES limit with their
 			 * current usage */
 			xfree(job_ptr->state_desc);
-			job_ptr->state_reason = WAIT_QOS_MAX_CPU_PER_USER;
+			job_ptr->state_reason = get_tres_state_reason(
+				tres_pos, WAIT_QOS_MAX_UNK_PER_USER);
 			debug2("job %u being held, "
 			       "if allowed the job request will exceed "
 			       "QOS %s max tres(%s%s%s) per user limit "
@@ -1792,7 +2037,9 @@ extern bool acct_policy_validate(job_desc_msg_t *job_desc,
 			   we want to send back.
 			*/
 			if (reason)
-				*reason = WAIT_ASSOC_GRP_CPU;
+				*reason = get_tres_state_reason(
+					tres_pos, WAIT_ASSOC_GRP_UNK);
+
 			debug2("job submit for user %s(%u): "
 			       "min tres(%s%s%s) request %"PRIu64" exceeds "
 			       "group max tres limit %"PRIu64" for account %s",
@@ -1859,7 +2106,9 @@ extern bool acct_policy_validate(job_desc_msg_t *job_desc,
 			   we want to send back.
 			*/
 			if (reason)
-				*reason = WAIT_ASSOC_MAX_CPUS_PER_JOB;
+				*reason = get_tres_state_reason(
+					tres_pos, WAIT_ASSOC_MAX_UNK_PER_JOB);
+
 			debug2("job submit for user %s(%u): "
 			       "min tres(%s%s%s) request %"PRIu64" exceeds "
 			       "max tres limit %"PRIu64" for account %s",
@@ -2244,7 +2493,8 @@ extern bool acct_policy_job_runnable_post_select(
 		switch (i) {
 		case 1:
 			xfree(job_ptr->state_desc);
-			job_ptr->state_reason = WAIT_ASSOC_GRP_CPU_MIN;
+			job_ptr->state_reason = get_tres_state_reason(
+				tres_pos, WAIT_ASSOC_GRP_UNK_MIN);
 			debug2("Job %u being held, "
 			       "assoc %u(%s/%s/%s) group max tres(%s%s%s) "
 			       "minutes limit of %"PRIu64" is already at or "
@@ -2263,7 +2513,8 @@ extern bool acct_policy_job_runnable_post_select(
 			break;
 		case 2:
 			xfree(job_ptr->state_desc);
-			job_ptr->state_reason = WAIT_ASSOC_GRP_CPU_MIN;
+			job_ptr->state_reason = get_tres_state_reason(
+				tres_pos, WAIT_ASSOC_GRP_UNK_MIN);
 			debug2("Job %u being held, "
 			       "the job is requesting more than allowed "
 			       "with assoc %u(%s/%s/%s) "
@@ -2290,7 +2541,8 @@ extern bool acct_policy_job_runnable_post_select(
 			 * being killed */
 			xfree(job_ptr->state_desc);
-			job_ptr->state_reason = WAIT_ASSOC_GRP_CPU_MIN;
+			job_ptr->state_reason = get_tres_state_reason(
+				tres_pos, WAIT_ASSOC_GRP_UNK_MIN);
 			debug2("Job %u being held, "
 			       "the job is at or exceeds assoc %u(%s/%s/%s) "
 			       "group max tres(%s%s%s) minutes of %"PRIu64" "
@@ -2332,7 +2584,8 @@ extern bool acct_policy_job_runnable_post_select(
 			break;
 		case 2:
 			xfree(job_ptr->state_desc);
-			job_ptr->state_reason = WAIT_ASSOC_GRP_CPU;
+			job_ptr->state_reason = get_tres_state_reason(
+				tres_pos, WAIT_ASSOC_GRP_UNK);
 			debug2("job %u is being held, "
 			       "assoc %u(%s/%s/%s) min tres(%s%s%s) "
 			       "request %"PRIu64" exceeds "
@@ -2351,7 +2604,8 @@ extern bool acct_policy_job_runnable_post_select(
 			break;
 		case 3:
 			xfree(job_ptr->state_desc);
-			job_ptr->state_reason = WAIT_ASSOC_GRP_CPU;
+			job_ptr->state_reason = get_tres_state_reason(
+				tres_pos, WAIT_ASSOC_GRP_UNK);
 			debug2("job %u being held, "
 			       "if allowed the job request will exceed "
 			       "assoc %u(%s/%s/%s) group max "
@@ -2388,7 +2642,8 @@ extern bool acct_policy_job_runnable_post_select(
 			break;
 		case 2:
 			xfree(job_ptr->state_desc);
-			job_ptr->state_reason = WAIT_ASSOC_GRP_CPU_RUN_MIN;
+			job_ptr->state_reason = get_tres_state_reason(
+				tres_pos, WAIT_ASSOC_GRP_UNK_RUN_MIN);
 			debug2("job %u is being held, "
 			       "assoc %u(%s/%s/%s) group max running "
 			       "tres(%s%s%s) minutes limit %"PRIu64
@@ -2407,7 +2662,8 @@ extern bool acct_policy_job_runnable_post_select(
 			break;
 		case 3:
 			xfree(job_ptr->state_desc);
-			job_ptr->state_reason = WAIT_ASSOC_GRP_CPU_RUN_MIN;
+			job_ptr->state_reason = get_tres_state_reason(
+				tres_pos, WAIT_ASSOC_GRP_UNK_RUN_MIN);
 			debug2("job %u being held, "
 			       "if allowed the job request will exceed "
 			       "assoc %u(%s/%s/%s) group max running "
@@ -2453,7 +2709,8 @@ extern bool acct_policy_job_runnable_post_select(
 					       job_ptr->limit_set.tres,
 					       1, 0, 1)) {
 		xfree(job_ptr->state_desc);
-		job_ptr->state_reason = WAIT_ASSOC_MAX_CPU_MINS_PER_JOB;
+		job_ptr->state_reason = get_tres_state_reason(
+			tres_pos, WAIT_ASSOC_MAX_UNK_MINS_PER_JOB);
 		debug2("Job %u being held, "
 		       "the job is requesting more than allowed "
 		       "with assoc %u(%s/%s/%s) max tres(%s%s%s) "
@@ -2478,7 +2735,8 @@ extern bool acct_policy_job_runnable_post_select(
 					       job_ptr->limit_set.tres,
 					       1, 0, 1)) {
 		xfree(job_ptr->state_desc);
-		job_ptr->state_reason = WAIT_ASSOC_MAX_CPUS_PER_JOB;
+		job_ptr->state_reason = get_tres_state_reason(
+			tres_pos, WAIT_ASSOC_MAX_UNK_PER_JOB);
 		debug2("job %u is being held, "
 		       "the job is requesting more than allowed "
 		       "with assoc %u(%s/%s/%s) max tres(%s%s%s) "
@@ -2567,7 +2825,7 @@ extern uint32_t acct_policy_get_max_nodes(struct job_record *job_ptr,
 
 		if (grp_nodes < max_nodes_limit) {
 			max_nodes_limit = grp_nodes;
-			*wait_reason = WAIT_QOS_GRP_NODES;
+			*wait_reason = WAIT_QOS_GRP_NODE;
 		}
 	}
 
@@ -2581,7 +2839,7 @@ extern uint32_t acct_policy_get_max_nodes(struct job_record *job_ptr,
 				     max_nodes_limit)) {
 			max_nodes_limit = assoc_ptr->
 				grp_tres_ctld[TRES_ARRAY_NODE];
-			*wait_reason = WAIT_ASSOC_GRP_NODES;
+			*wait_reason = WAIT_ASSOC_GRP_NODE;
 			grp_set = 1;
 		}
-- 
GitLab