From 2b3bb18532a073d1f9599c20bac3e7b829f4d3c5 Mon Sep 17 00:00:00 2001 From: Moe Jette <jette1@llnl.gov> Date: Mon, 12 Jul 2010 17:52:06 +0000 Subject: [PATCH] Move pretty much all of the gres logic from the plugin into common/gres.c --- src/common/gres.c | 949 +++++++++++++++++++++++++++----- src/common/gres.h | 32 ++ src/plugins/gres/gpu/gres_gpu.c | 805 --------------------------- src/plugins/gres/nic/gres_nic.c | 805 --------------------------- 4 files changed, 849 insertions(+), 1742 deletions(-) diff --git a/src/common/gres.c b/src/common/gres.c index 52078056d1a..21b8c705c0e 100644 --- a/src/common/gres.c +++ b/src/common/gres.c @@ -85,56 +85,6 @@ typedef struct slurm_gres_ops { char (*gres_name); char (*help_msg); int (*node_config_load) ( List gres_conf_list ); - void * (*node_state_dup) ( void *gres_data ); - void (*node_state_dealloc) ( void *gres_data ); - int (*node_state_realloc) ( void *job_gres_data, - int node_offset, - void *node_gres_data ); - void (*node_state_log) ( void *gres_data, - char *node_name ); - - void (*job_state_delete) ( void *gres_data ); - int (*job_state_validate) ( char *config, - void **gres_data ); - void * (*job_state_dup) ( void *gres_data ); - int (*job_state_pack) ( void *gres_data, - Buf buffer ); - int (*job_state_unpack) ( void **gres_data, - Buf buffer ); - void (*job_state_log) ( void *gres_data, - uint32_t job_id ); - uint32_t (*job_test) ( void *job_gres_data, - void *node_gres_data, - bool use_total_gres ); - int (*job_alloc) ( void *job_gres_data, - void *node_gres_data, - int node_cnt, - int node_offset, - uint32_t cpu_cnt ); - int (*job_dealloc) ( void *job_gres_data, - void *node_gres_data, - int node_offset ); - void (*step_state_delete) ( void *gres_data ); - int (*step_state_validate) ( char *config, - void **gres_data ); - void * (*step_state_dup) ( void *gres_data ); - int (*step_state_pack) ( void *gres_data, - Buf buffer ); - int (*step_state_unpack) ( void **gres_data, - Buf buffer ); - void (*step_state_log) ( void *gres_data, - uint32_t job_id, - uint32_t step_id ); - uint32_t (*step_test) ( void *job_gres_data, - void *step_gres_data, - int node_offset, - bool ignore_alloc ); - uint32_t (*step_alloc) ( void *job_gres_data, - void *step_gres_data, - int node_offset, - uint32_t cpu_cnt ); - uint32_t (*step_dealloc) ( void *job_gres_data, - void *step_gres_data ); } slurm_gres_ops_t; typedef struct slurm_gres_context { @@ -181,28 +131,6 @@ static int _load_gres_plugin(char *plugin_name, "gres_name", "help_msg", "node_config_load", - "node_state_dup", - "node_state_dealloc", - "node_state_realloc", - "node_state_log", - "job_state_delete", - "job_state_validate", - "job_state_dup", - "job_state_pack", - "job_state_unpack", - "job_state_log", - "job_test", - "job_alloc", - "job_dealloc", - "step_state_delete", - "step_state_validate", - "step_state_dup", - "step_state_pack", - "step_state_unpack", - "step_state_log", - "step_test", - "step_alloc", - "step_dealloc" }; int n_syms = sizeof(syms) / sizeof(char *); @@ -900,7 +828,7 @@ extern int _node_config_validate(char *node_name, uint32_t gres_cnt, updated_config = true; } else if (gres_ptr->gres_cnt_found != gres_cnt) { if (gres_ptr->gres_cnt_found != NO_VAL) { - info("gres/%s count changed for node %s from %u to %u", + info("gres/%s:count changed for node %s from %u to %u", gres_name, node_name, gres_ptr->gres_cnt_found, gres_cnt); } @@ -1412,6 +1340,24 @@ unpack_error: goto fini; } +static void *_node_state_dup(void *gres_data) +{ + gres_node_state_t *gres_ptr = (gres_node_state_t *) gres_data; + gres_node_state_t *new_gres; + + if (gres_ptr == NULL) + return NULL; + + new_gres = xmalloc(sizeof(gres_node_state_t)); + new_gres->gres_cnt_found = gres_ptr->gres_cnt_found; + new_gres->gres_cnt_config = gres_ptr->gres_cnt_config; + new_gres->gres_cnt_avail = gres_ptr->gres_cnt_avail; + new_gres->gres_cnt_alloc = gres_ptr->gres_cnt_alloc; + new_gres->gres_bit_alloc = bit_copy(gres_ptr->gres_bit_alloc); + + return new_gres; +} + /* * Duplicate a node gres status (used for will-run logic) * IN gres_list - node gres state information @@ -1442,8 +1388,7 @@ extern List gres_plugin_node_state_dup(List gres_list) if (gres_ptr->plugin_id != *(gres_context[i].ops.plugin_id)) continue; - gres_data = (*(gres_context[i].ops.node_state_dup)) - (gres_ptr->gres_data); + gres_data = _node_state_dup(gres_ptr->gres_data); if (gres_data) { new_gres = xmalloc(sizeof(gres_state_t)); new_gres->plugin_id = gres_ptr->plugin_id; @@ -1463,6 +1408,18 @@ extern List gres_plugin_node_state_dup(List gres_list) return new_list; } +static void _node_state_dealloc(void *gres_data) +{ + gres_node_state_t *gres_ptr = (gres_node_state_t *) gres_data; + + gres_ptr->gres_cnt_alloc = 0; + if (gres_ptr->gres_bit_alloc) { + int i = bit_size(gres_ptr->gres_bit_alloc) - 1; + if (i > 0) + bit_nclear(gres_ptr->gres_bit_alloc, 0, i); + } +} + /* * Deallocate all resources on this node previous allocated to any jobs. * This function isused to synchronize state after slurmctld restarts or @@ -1487,8 +1444,7 @@ extern void gres_plugin_node_state_dealloc(List gres_list) if (gres_ptr->plugin_id != *(gres_context[i].ops.plugin_id)) continue; - (*(gres_context[i].ops.node_state_dealloc)) - (gres_ptr->gres_data); + _node_state_dealloc(gres_ptr->gres_data); break; } } @@ -1496,6 +1452,70 @@ extern void gres_plugin_node_state_dealloc(List gres_list) slurm_mutex_unlock(&gres_context_lock); } +static int _node_state_realloc(void *job_gres_data, int node_offset, + void *node_gres_data, char *gres_name) +{ + gres_job_state_t *job_gres_ptr = (gres_job_state_t *) job_gres_data; + gres_node_state_t *node_gres_ptr = (gres_node_state_t *) node_gres_data; + int i, job_bit_size, node_bit_size; + + xassert(job_gres_ptr); + xassert(node_gres_ptr); + + if (node_offset >= job_gres_ptr->node_cnt) { + error("gres: %s job node offset is bad (%d >= %u)", + gres_name, node_offset, job_gres_ptr->node_cnt); + return EINVAL; + } + + if ((job_gres_ptr->gres_bit_alloc == NULL) || + (job_gres_ptr->gres_bit_alloc[node_offset] == NULL)) { + error("gres/%s:job bit_alloc is NULL", gres_name); + return EINVAL; + } + + if (node_gres_ptr->gres_bit_alloc == NULL) { + error("gres/%s: node bit_alloc is NULL", gres_name); + return EINVAL; + } + + job_bit_size = bit_size(job_gres_ptr->gres_bit_alloc[node_offset]); + node_bit_size = bit_size(node_gres_ptr->gres_bit_alloc); + if (job_bit_size > node_bit_size) { + error("gres/%s: job/node bit size mismatch (%d != %d)", + gres_name, job_bit_size, node_bit_size); + /* Node needs to register with more resources, expand + * node's bitmap now so we can merge the data */ + node_gres_ptr->gres_bit_alloc = + bit_realloc(node_gres_ptr->gres_bit_alloc, + job_bit_size); + if (node_gres_ptr->gres_bit_alloc == NULL) + fatal("bit_realloc: malloc failure"); + node_bit_size = job_bit_size; + } + if (job_bit_size < node_bit_size) { + error("gres/%s: job/node bit size mismatch (%d != %d)", + gres_name, job_bit_size, node_bit_size); + /* Update what we can */ + node_bit_size = MIN(job_bit_size, node_bit_size); + for (i=0; i<node_bit_size; i++) { + if (!bit_test(job_gres_ptr->gres_bit_alloc[node_offset], + i)) + continue; + node_gres_ptr->gres_cnt_alloc++; + bit_set(node_gres_ptr->gres_bit_alloc, i); + } + } else { + node_gres_ptr->gres_cnt_alloc += bit_set_count(job_gres_ptr-> + gres_bit_alloc + [node_offset]); + bit_or(node_gres_ptr->gres_bit_alloc, + job_gres_ptr->gres_bit_alloc[node_offset]); + } + + return SLURM_SUCCESS; +} + /* * Allocate in this nodes record the resources previously allocated to this * job. This function isused to synchronize state after slurmctld restarts @@ -1539,9 +1559,10 @@ extern int gres_plugin_node_state_realloc(List job_gres_list, int node_offset, if (job_gres_ptr->plugin_id != *(gres_context[i].ops.plugin_id)) continue; - (*(gres_context[i].ops.node_state_realloc)) - (job_gres_ptr->gres_data, node_offset, - node_gres_ptr->gres_data); + _node_state_realloc(job_gres_ptr->gres_data, + node_offset, + node_gres_ptr->gres_data, + gres_context[i].ops.gres_name); break; } } @@ -1551,6 +1572,25 @@ extern int gres_plugin_node_state_realloc(List job_gres_list, int node_offset, return SLURM_SUCCESS; } +static void _node_state_log(void *gres_data, char *node_name, char *gres_name) +{ + gres_node_state_t *gres_ptr; + + xassert(gres_data); + gres_ptr = (gres_node_state_t *) gres_data; + info("gres/%s: state for %s", gres_name, node_name); + info(" gres_cnt found:%u configured:%u avail:%u alloc:%u", + gres_ptr->gres_cnt_found, gres_ptr->gres_cnt_config, + gres_ptr->gres_cnt_avail, gres_ptr->gres_cnt_alloc); + if (gres_ptr->gres_bit_alloc) { + char tmp_str[128]; + bit_fmt(tmp_str, sizeof(tmp_str), gres_ptr->gres_bit_alloc); + info(" gres_bit_alloc:%s", tmp_str); + } else { + info(" gres_bit_alloc:NULL"); + } +} + /* * Log a node's current gres state * IN gres_list - generated by gres_plugin_node_config_validate() @@ -1574,8 +1614,8 @@ extern void gres_plugin_node_state_log(List gres_list, char *node_name) if (gres_ptr->plugin_id != *(gres_context[i].ops.plugin_id)) continue; - (*(gres_context[i].ops.node_state_log)) - (gres_ptr->gres_data, node_name); + _node_state_log(gres_ptr->gres_data, node_name, + gres_context[i].ops.gres_name); break; } } @@ -1583,6 +1623,27 @@ extern void gres_plugin_node_state_log(List gres_list, char *node_name) slurm_mutex_unlock(&gres_context_lock); } +static void _job_state_delete(void *gres_data) +{ + int i; + gres_job_state_t *gres_ptr = (gres_job_state_t *) gres_data; + + if (gres_ptr == NULL) + return; + + if (gres_ptr->gres_bit_alloc) { + for (i=0; i<gres_ptr->node_cnt; i++) + FREE_NULL_BITMAP(gres_ptr->gres_bit_alloc[i]); + xfree(gres_ptr->gres_bit_alloc); + } + if (gres_ptr->gres_bit_step_alloc) { + for (i=0; i<gres_ptr->node_cnt; i++) + FREE_NULL_BITMAP(gres_ptr->gres_bit_step_alloc[i]); + xfree(gres_ptr->gres_bit_step_alloc); + } + xfree(gres_ptr); +} + static void _gres_job_list_delete(void *list_element) { int i; @@ -1596,13 +1657,48 @@ static void _gres_job_list_delete(void *list_element) for (i=0; i<gres_context_cnt; i++) { if (gres_ptr->plugin_id != *(gres_context[i].ops.plugin_id)) continue; - (*(gres_context[i].ops.job_state_delete))(gres_ptr->gres_data); + _job_state_delete(gres_ptr->gres_data); xfree(gres_ptr); break; } slurm_mutex_unlock(&gres_context_lock); } +static int _job_state_validate(char *config, void **gres_data, char *gres_name) +{ + char *last = NULL; + char name_colon[128]; + int name_colon_len; + gres_job_state_t *gres_ptr; + uint32_t cnt; + uint8_t mult = 0; + + name_colon_len = snprintf(name_colon, sizeof(name_colon), "%s:", + gres_name); + if (!strcmp(config, gres_name)) { + cnt = 1; + } else if (!strncmp(config, name_colon, name_colon_len)) { + cnt = strtol(config+name_colon_len, &last, 10); + if (last[0] == '\0') + ; + else if ((last[0] == 'k') || (last[0] == 'K')) + cnt *= 1024; + else if (!strcasecmp(last, "*cpu")) + mult = 1; + else + return SLURM_ERROR; + if (cnt == 0) + return SLURM_ERROR; + } else + return SLURM_ERROR; + + gres_ptr = xmalloc(sizeof(gres_job_state_t)); + gres_ptr->gres_cnt_alloc = cnt; + gres_ptr->gres_cnt_mult = mult; + *gres_data = gres_ptr; + return SLURM_SUCCESS; +} + /* * Given a job's requested gres configuration, validate it and build a gres list * IN req_config - job request's gres input string @@ -1636,8 +1732,8 @@ extern int gres_plugin_job_state_validate(char *req_config, List *gres_list) while (tok && (rc == SLURM_SUCCESS)) { rc2 = SLURM_ERROR; for (i=0; i<gres_context_cnt; i++) { - rc2 = (*(gres_context[i].ops.job_state_validate)) - (tok, &gres_data); + rc2 = _job_state_validate(tok, &gres_data, + gres_context[i].ops.gres_name); if (rc2 != SLURM_SUCCESS) continue; gres_ptr = xmalloc(sizeof(gres_state_t)); @@ -1659,6 +1755,31 @@ extern int gres_plugin_job_state_validate(char *req_config, List *gres_list) return rc; } +static void *_job_state_dup(void *gres_data) +{ + + int i; + gres_job_state_t *gres_ptr = (gres_job_state_t *) gres_data; + gres_job_state_t *new_gres_ptr; + + if (gres_ptr == NULL) + return NULL; + + new_gres_ptr = xmalloc(sizeof(gres_job_state_t)); + new_gres_ptr->gres_cnt_alloc = gres_ptr->gres_cnt_alloc; + new_gres_ptr->gres_cnt_mult = gres_ptr->gres_cnt_mult; + new_gres_ptr->node_cnt = gres_ptr->node_cnt; + new_gres_ptr->gres_bit_alloc = xmalloc(sizeof(bitstr_t *) * + gres_ptr->node_cnt); + for (i=0; i<gres_ptr->node_cnt; i++) { + if (gres_ptr->gres_bit_alloc[i] == NULL) + continue; + new_gres_ptr->gres_bit_alloc[i] = bit_copy(gres_ptr-> + gres_bit_alloc[i]); + } + return new_gres_ptr; +} + /* * Create a copy of a job's gres state * IN gres_list - List of Gres records for this job to track usage @@ -1684,8 +1805,7 @@ List gres_plugin_job_state_dup(List gres_list) if (gres_ptr->plugin_id != *(gres_context[i].ops.plugin_id)) continue; - new_gres_data = (*(gres_context[i].ops.job_state_dup)) - (gres_ptr->gres_data); + new_gres_data = _job_state_dup(gres_ptr->gres_data); if (new_gres_data == NULL) break; if (new_gres_list == NULL) { @@ -1710,6 +1830,21 @@ List gres_plugin_job_state_dup(List gres_list) return new_gres_list; } +static int _job_state_pack(void *gres_data, Buf buffer) +{ + int i; + gres_job_state_t *gres_ptr = (gres_job_state_t *) gres_data; + + pack32(gres_ptr->gres_cnt_alloc, buffer); + pack8 (gres_ptr->gres_cnt_mult, buffer); + + pack32(gres_ptr->node_cnt, buffer); + for (i=0; i<gres_ptr->node_cnt; i++) + pack_bit_str(gres_ptr->gres_bit_alloc[i], buffer); + + return SLURM_SUCCESS; +} + /* * Pack a job's current gres status, called from slurmctld for save/restore * IN gres_list - generated by gres_plugin_job_config_validate() @@ -1748,8 +1883,7 @@ extern int gres_plugin_job_state_pack(List gres_list, Buf buffer, size_offset = get_buf_offset(buffer); pack32(gres_size, buffer); /* placeholder */ data_offset = get_buf_offset(buffer); - rc2 = (*(gres_context[i].ops.job_state_pack)) - (gres_ptr->gres_data, buffer); + rc2 = _job_state_pack(gres_ptr->gres_data, buffer); if (rc2 != SLURM_SUCCESS) { rc = rc2; set_buf_offset(buffer, header_offset); @@ -1780,6 +1914,39 @@ extern int gres_plugin_job_state_pack(List gres_list, Buf buffer, return rc; } +static int _job_state_unpack(void **gres_data, Buf buffer, char *gres_name) +{ + int i; + gres_job_state_t *gres_ptr; + + gres_ptr = xmalloc(sizeof(gres_job_state_t)); + + if (buffer) { + safe_unpack32(&gres_ptr->gres_cnt_alloc, buffer); + safe_unpack8 (&gres_ptr->gres_cnt_mult, buffer); + + safe_unpack32(&gres_ptr->node_cnt, buffer); + gres_ptr->gres_bit_alloc = xmalloc(sizeof(bitstr_t *) * + gres_ptr->node_cnt); + for (i=0; i<gres_ptr->node_cnt; i++) + unpack_bit_str(&gres_ptr->gres_bit_alloc[i], buffer); + } + + *gres_data = gres_ptr; + return SLURM_SUCCESS; + +unpack_error: + error("Unpacking gres/%s job state info", gres_name); + if (gres_ptr->gres_bit_alloc) { + for (i=0; i<gres_ptr->node_cnt; i++) + FREE_NULL_BITMAP(gres_ptr->gres_bit_alloc[i]); + xfree(gres_ptr->gres_bit_alloc); + } + xfree(gres_ptr); + *gres_data = NULL; + return SLURM_ERROR; +} + /* * Unpack a job's current gres status, called from slurmctld for save/restore * OUT gres_list - restored state stored by gres_plugin_job_state_pack() @@ -1836,8 +2003,8 @@ extern int gres_plugin_job_state_unpack(List *gres_list, Buf buffer, continue; } gres_context[i].unpacked_info = true; - rc2 = (*(gres_context[i].ops.job_state_unpack)) - (&gres_data, buffer); + rc2 = _job_state_unpack(&gres_data, buffer, + gres_context[i].ops.gres_name); if (rc2 != SLURM_SUCCESS) { rc = rc2; } else { @@ -1857,8 +2024,8 @@ fini: /* Insure that every gres plugin is called for unpack, even if no data debug("gres_plugin_job_state_unpack: no info packed for %s " "by job %u", gres_context[i].gres_type, job_id); - rc2 = (*(gres_context[i].ops.job_state_unpack)) - (&gres_data, NULL); + rc2 = _job_state_unpack(&gres_data, NULL, + gres_context[i].ops.gres_name); if (rc2 != SLURM_SUCCESS) { rc = rc2; } else { @@ -1879,6 +2046,28 @@ unpack_error: goto fini; } +static uint32_t _job_test(void *job_gres_data, void *node_gres_data, + bool use_total_gres) +{ + uint32_t gres_avail; + gres_job_state_t *job_gres_ptr = (gres_job_state_t *) job_gres_data; + gres_node_state_t *node_gres_ptr = (gres_node_state_t *) node_gres_data; + + gres_avail = node_gres_ptr->gres_cnt_avail; + if (!use_total_gres) + gres_avail -= node_gres_ptr->gres_cnt_alloc; + + if (job_gres_ptr->gres_cnt_mult == 0) { + /* per node gres limit */ + if (job_gres_ptr->gres_cnt_alloc > gres_avail) + return (uint32_t) 0; + return NO_VAL; + } else { + /* per CPU gres limit */ + return (uint32_t) (gres_avail / job_gres_ptr->gres_cnt_alloc); + } +} + /* * Determine how many CPUs on the node can be used by this job * IN job_gres_list - job's gres_list built by gres_plugin_job_state_validate() @@ -1925,10 +2114,9 @@ extern uint32_t gres_plugin_job_test(List job_gres_list, List node_gres_list, if (job_gres_ptr->plugin_id != *(gres_context[i].ops.plugin_id)) continue; - tmp_cnt = (*(gres_context[i].ops.job_test)) - (job_gres_ptr->gres_data, - node_gres_ptr->gres_data, - use_total_gres); + tmp_cnt = _job_test(job_gres_ptr->gres_data, + node_gres_ptr->gres_data, + use_total_gres); cpu_cnt = MIN(tmp_cnt, cpu_cnt); break; } @@ -1941,6 +2129,93 @@ extern uint32_t gres_plugin_job_test(List job_gres_list, List node_gres_list, return cpu_cnt; } +extern int _job_alloc(void *job_gres_data, void *node_gres_data, + int node_cnt, int node_offset, uint32_t cpu_cnt, + char *gres_name) +{ + int i; + uint32_t gres_cnt; + gres_job_state_t *job_gres_ptr = (gres_job_state_t *) job_gres_data; + gres_node_state_t *node_gres_ptr = (gres_node_state_t *) node_gres_data; + + /* + * Validate data structures. Either job_gres_data->node_cnt and + * job_gres_data->gres_bit_alloc are both set or both zero/NULL. + */ + xassert(node_cnt); + xassert(node_offset >= 0); + xassert(job_gres_ptr); + xassert(node_gres_ptr); + xassert(node_gres_ptr->gres_bit_alloc); + if (job_gres_ptr->node_cnt == 0) { + job_gres_ptr->node_cnt = node_cnt; + if (job_gres_ptr->gres_bit_alloc) { + error("gres/%s: node_cnt==0 and bit_alloc is set", + gres_name); + xfree(job_gres_ptr->gres_bit_alloc); + } + job_gres_ptr->gres_bit_alloc = xmalloc(sizeof(bitstr_t *) * + node_cnt); + } else if (job_gres_ptr->node_cnt < node_cnt) { + error("gres/%s: node_cnt increase from %u to %d", + gres_name, job_gres_ptr->node_cnt, node_cnt); + if (node_offset >= job_gres_ptr->node_cnt) + return SLURM_ERROR; + } else if (job_gres_ptr->node_cnt > node_cnt) { + error("gres/%s: node_cnt decrease from %u to %d", + gres_name, job_gres_ptr->node_cnt, node_cnt); + } + + /* + * Check that sufficient resources exist on this node + */ + if (job_gres_ptr->gres_cnt_mult == 0) + gres_cnt = job_gres_ptr->gres_cnt_alloc; + else + gres_cnt = (job_gres_ptr->gres_cnt_alloc * cpu_cnt); + i = node_gres_ptr->gres_cnt_alloc + gres_cnt; + i -= node_gres_ptr->gres_cnt_avail; + if (i > 0) { + error("gres/%s: overallocated resources by %d", gres_name, i); + /* proceed with request, give job what's available */ + } + + /* + * Select the specific resources to use for this job. + * We'll need to add topology information in the future + */ + if (job_gres_ptr->gres_bit_alloc[node_offset]) { + /* Resuming a suspended job, resources already allocated */ + debug("gres/%s: job's bit_alloc is already set for node %d", + gres_name, node_offset); + gres_cnt = MIN(bit_size(node_gres_ptr->gres_bit_alloc), + bit_size(job_gres_ptr-> + gres_bit_alloc[node_offset])); + for (i=0; i<gres_cnt; i++) { + if (bit_test(job_gres_ptr->gres_bit_alloc[node_offset], + i)) { + bit_set(node_gres_ptr->gres_bit_alloc, i); + node_gres_ptr->gres_cnt_alloc++; + } + } + } else { + job_gres_ptr->gres_bit_alloc[node_offset] = + bit_alloc(node_gres_ptr->gres_cnt_avail); + if (job_gres_ptr->gres_bit_alloc[node_offset] == NULL) + fatal("bit_copy: malloc failure"); + for (i=0; i<node_gres_ptr->gres_cnt_avail && gres_cnt>0; i++) { + if (bit_test(node_gres_ptr->gres_bit_alloc, i)) + continue; + bit_set(node_gres_ptr->gres_bit_alloc, i); + bit_set(job_gres_ptr->gres_bit_alloc[node_offset], i); + node_gres_ptr->gres_cnt_alloc++; + gres_cnt--; + } + } + + return SLURM_SUCCESS; +} + /* * Allocate resource to a job and update node and job gres information * IN job_gres_list - job's gres_list built by gres_plugin_job_state_validate() @@ -1983,10 +2258,10 @@ extern int gres_plugin_job_alloc(List job_gres_list, List node_gres_list, if (job_gres_ptr->plugin_id != *(gres_context[i].ops.plugin_id)) continue; - rc2 = (*(gres_context[i].ops.job_alloc)) - (job_gres_ptr->gres_data, + rc2 = _job_alloc(job_gres_ptr->gres_data, node_gres_ptr->gres_data, node_cnt, - node_offset, cpu_cnt); + node_offset, cpu_cnt, + gres_context[i].ops.gres_name); if (rc2 != SLURM_SUCCESS) rc = rc2; break; @@ -1998,6 +2273,56 @@ extern int gres_plugin_job_alloc(List job_gres_list, List node_gres_list, return rc; } +static int _job_dealloc(void *job_gres_data, void *node_gres_data, + int node_offset, char *gres_name) +{ + int i, len; + gres_job_state_t *job_gres_ptr = (gres_job_state_t *) job_gres_data; + gres_node_state_t *node_gres_ptr = (gres_node_state_t *) node_gres_data; + + /* + * Validate data structures. Either job_gres_data->node_cnt and + * job_gres_data->gres_bit_alloc are both set or both zero/NULL. + */ + xassert(node_offset >= 0); + xassert(job_gres_ptr); + xassert(node_gres_ptr); + xassert(node_gres_ptr->gres_bit_alloc); + if (job_gres_ptr->node_cnt <= node_offset) { + error("gres/%s bad node_offset %d count is %u", + gres_name, node_offset, job_gres_ptr->node_cnt); + return SLURM_ERROR; + } + if (job_gres_ptr->gres_bit_alloc == NULL) { + error("gres/%s job's bitmap is NULL", gres_name); + return SLURM_ERROR; + } + if (job_gres_ptr->gres_bit_alloc[node_offset] == NULL) { + error("gres/%s: job's bitmap is empty", gres_name); + return SLURM_ERROR; + } + + len = bit_size(job_gres_ptr->gres_bit_alloc[node_offset]); + i = bit_size(node_gres_ptr->gres_bit_alloc); + if (i != len) { + error("gres/%s: job and node bitmap sizes differ (%d != %d)", + gres_name, len, i); + len = MIN(len, i); + /* proceed with request, make best effort */ + } + for (i=0; i<len; i++) { + if (!bit_test(job_gres_ptr->gres_bit_alloc[node_offset], i)) + continue; + bit_clear(node_gres_ptr->gres_bit_alloc, i); + /* NOTE: Do not clear bit from + * job_gres_ptr->gres_bit_alloc[node_offset] + * since this may only be an emulated deallocate */ + node_gres_ptr->gres_cnt_alloc--; + } + + return SLURM_SUCCESS; +} + /* * Deallocate resource from a job and update node and job gres information * IN job_gres_list - job's gres_list built by gres_plugin_job_state_validate() @@ -2037,9 +2362,10 @@ extern int gres_plugin_job_dealloc(List job_gres_list, List node_gres_list, if (job_gres_ptr->plugin_id != *(gres_context[i].ops.plugin_id)) continue; - rc2 = (*(gres_context[i].ops.job_dealloc)) - (job_gres_ptr->gres_data, - node_gres_ptr->gres_data, node_offset); + rc2 = _job_dealloc(job_gres_ptr->gres_data, + node_gres_ptr->gres_data, + node_offset, + gres_context[i].ops.gres_name); if (rc2 != SLURM_SUCCESS) rc = rc2; break; @@ -2051,6 +2377,43 @@ extern int gres_plugin_job_dealloc(List job_gres_list, List node_gres_list, return rc; } +static void _job_state_log(void *gres_data, uint32_t job_id, char *gres_name) +{ + gres_job_state_t *gres_ptr; + char *mult, tmp_str[128]; + int i; + + xassert(gres_data); + gres_ptr = (gres_job_state_t *) gres_data; + info("gres: %s state for job %u", gres_name, job_id); + if (gres_ptr->gres_cnt_mult) + mult = "cpu"; + else + mult = "node"; + info(" gres_cnt:%u per %s node_cnt:%u", gres_ptr->gres_cnt_alloc, mult, + gres_ptr->node_cnt); + + if (gres_ptr->node_cnt && gres_ptr->gres_bit_alloc) { + for (i=0; i<gres_ptr->node_cnt; i++) { + bit_fmt(tmp_str, sizeof(tmp_str), + gres_ptr->gres_bit_alloc[i]); + info(" gres_bit_alloc[%d]:%s", i, tmp_str); + } + } else { + info(" gres_bit_alloc:NULL"); + } + + if (gres_ptr->node_cnt && gres_ptr->gres_bit_step_alloc) { + for (i=0; i<gres_ptr->node_cnt; i++) { + bit_fmt(tmp_str, sizeof(tmp_str), + gres_ptr->gres_bit_step_alloc[i]); + info(" gres_bit_step_alloc[%d]:%s", i, tmp_str); + } + } else { + info(" gres_bit_step_alloc:NULL"); + } +} + /* * Log a job's current gres state * IN gres_list - generated by gres_plugin_job_state_validate() @@ -2074,8 +2437,8 @@ extern void gres_plugin_job_state_log(List gres_list, uint32_t job_id) if (gres_ptr->plugin_id != *(gres_context[i].ops.plugin_id)) continue; - (*(gres_context[i].ops.job_state_log)) - (gres_ptr->gres_data, job_id); + _job_state_log(gres_ptr->gres_data, job_id, + gres_context[i].ops.gres_name); break; } } @@ -2083,6 +2446,22 @@ extern void gres_plugin_job_state_log(List gres_list, uint32_t job_id) slurm_mutex_unlock(&gres_context_lock); } +static void _step_state_delete(void *gres_data) +{ + int i; + gres_step_state_t *gres_ptr = (gres_step_state_t *) gres_data; + + if (gres_ptr == NULL) + return; + + if (gres_ptr->gres_bit_alloc) { + for (i=0; i<gres_ptr->node_cnt; i++) + FREE_NULL_BITMAP(gres_ptr->gres_bit_alloc[i]); + xfree(gres_ptr->gres_bit_alloc); + } + xfree(gres_ptr); +} + static void _gres_step_list_delete(void *list_element) { int i; @@ -2096,13 +2475,91 @@ static void _gres_step_list_delete(void *list_element) for (i=0; i<gres_context_cnt; i++) { if (gres_ptr->plugin_id != *(gres_context[i].ops.plugin_id)) continue; - (*(gres_context[i].ops.step_state_delete))(gres_ptr->gres_data); + _step_state_delete(gres_ptr->gres_data); xfree(gres_ptr); break; } slurm_mutex_unlock(&gres_context_lock); } +static int _step_state_validate(char *config, void **gres_data, char *gres_name) +{ + int name_colon_len; + char *last = NULL, name_colon[128]; + gres_job_state_t *gres_ptr; + uint32_t cnt; + uint8_t mult = 0; + + name_colon_len = snprintf(name_colon, sizeof(name_colon), "%s:", + gres_name); + if (!strcmp(config, gres_name)) { + cnt = 1; + } else if (!strncmp(config, name_colon, name_colon_len)) { + cnt = strtol(config+name_colon_len, &last, 10); + if (last[0] == '\0') + ; + else if ((last[0] == 'k') || (last[0] == 'K')) + cnt *= 1024; + else if (!strcasecmp(last, "*cpu")) + mult = 1; + else + return SLURM_ERROR; + if (cnt == 0) + return SLURM_ERROR; + } else + return SLURM_ERROR; + + gres_ptr = xmalloc(sizeof(gres_step_state_t)); + gres_ptr->gres_cnt_alloc = cnt; + gres_ptr->gres_cnt_mult = mult; + *gres_data = gres_ptr; + return SLURM_SUCCESS; +} + +static uint32_t _step_test(void *step_gres_data, void *job_gres_data, + int node_offset, bool ignore_alloc, char *gres_name) +{ + gres_job_state_t *job_gres_ptr = (gres_job_state_t *) job_gres_data; + gres_step_state_t *step_gres_ptr = (gres_step_state_t *) step_gres_data; + uint32_t gres_cnt; + + xassert(job_gres_ptr); + xassert(step_gres_ptr); + + if (node_offset == NO_VAL) { + if (step_gres_ptr->gres_cnt_alloc > job_gres_ptr->gres_cnt_alloc) + return 0; + return NO_VAL; + } + + if (node_offset >= job_gres_ptr->node_cnt) { + error("gres/%s: step_test node offset invalid (%d >= %u)", + gres_name, node_offset, job_gres_ptr->node_cnt); + return 0; + } + if ((job_gres_ptr->gres_bit_alloc == NULL) || + (job_gres_ptr->gres_bit_alloc[node_offset] == NULL)) { + error("gres/%s: step_test gres_bit_alloc is NULL", gres_name); + return 0; + } + + gres_cnt = bit_set_count(job_gres_ptr->gres_bit_alloc[node_offset]); + if (!ignore_alloc && + job_gres_ptr->gres_bit_step_alloc && + job_gres_ptr->gres_bit_step_alloc[node_offset]) { + gres_cnt -= bit_set_count(job_gres_ptr-> + gres_bit_step_alloc[node_offset]); + } + if (step_gres_ptr->gres_cnt_mult) /* Gres count per CPU */ + gres_cnt /= step_gres_ptr->gres_cnt_alloc; + else if (step_gres_ptr->gres_cnt_alloc > gres_cnt) + gres_cnt = 0; + else + gres_cnt = NO_VAL; + + return gres_cnt; +} + /* * Given a step's requested gres configuration, validate it and build gres list * IN req_config - step request's gres input string @@ -2143,8 +2600,8 @@ extern int gres_plugin_step_state_validate(char *req_config, while (tok && (rc == SLURM_SUCCESS)) { rc2 = SLURM_ERROR; for (i=0; i<gres_context_cnt; i++) { - rc2 = (*(gres_context[i].ops.step_state_validate)) - (tok, &step_gres_data); + rc2 = _step_state_validate(tok, &step_gres_data, + gres_context[i].ops.gres_name); if (rc2 != SLURM_SUCCESS) continue; /* Now make sure the step's request isn't too big for @@ -2164,9 +2621,8 @@ extern int gres_plugin_step_state_validate(char *req_config, break; } job_gres_data = job_gres_ptr->gres_data; - rc3 = (*(gres_context[i].ops.step_test)) - (step_gres_data, job_gres_data, NO_VAL, - true); + rc3 = _step_test(step_gres_data, job_gres_data, NO_VAL, + true, gres_context[i].ops.gres_name); if (rc3 == 0) { info("Step gres more than in job allocation %s", tok); @@ -2194,6 +2650,31 @@ extern int gres_plugin_step_state_validate(char *req_config, return rc; } +static void *_step_state_dup(void *gres_data) +{ + + int i; + gres_step_state_t *gres_ptr = (gres_step_state_t *) gres_data; + gres_step_state_t *new_gres_ptr; + + if (gres_ptr == NULL) + return NULL; + + new_gres_ptr = xmalloc(sizeof(gres_step_state_t)); + new_gres_ptr->gres_cnt_alloc = gres_ptr->gres_cnt_alloc; + new_gres_ptr->gres_cnt_mult = gres_ptr->gres_cnt_mult; + new_gres_ptr->node_cnt = gres_ptr->node_cnt; + new_gres_ptr->gres_bit_alloc = xmalloc(sizeof(bitstr_t *) * + gres_ptr->node_cnt); + for (i=0; i<gres_ptr->node_cnt; i++) { + if (gres_ptr->gres_bit_alloc[i] == NULL) + continue; + new_gres_ptr->gres_bit_alloc[i] = bit_copy(gres_ptr-> + gres_bit_alloc[i]); + } + return new_gres_ptr; +} + /* * Create a copy of a step's gres state * IN gres_list - List of Gres records for this step to track usage @@ -2219,8 +2700,7 @@ List gres_plugin_step_state_dup(List gres_list) if (gres_ptr->plugin_id != *(gres_context[i].ops.plugin_id)) continue; - new_gres_data = (*(gres_context[i].ops.step_state_dup)) - (gres_ptr->gres_data); + new_gres_data = _step_state_dup(gres_ptr->gres_data); if (new_gres_data == NULL) break; if (new_gres_list == NULL) { @@ -2245,6 +2725,21 @@ List gres_plugin_step_state_dup(List gres_list) return new_gres_list; } +static int _step_state_pack(void *gres_data, Buf buffer) +{ + int i; + gres_step_state_t *gres_ptr = (gres_step_state_t *) gres_data; + + pack32(gres_ptr->gres_cnt_alloc, buffer); + pack8 (gres_ptr->gres_cnt_mult, buffer); + + pack32(gres_ptr->node_cnt, buffer); + for (i=0; i<gres_ptr->node_cnt; i++) + pack_bit_str(gres_ptr->gres_bit_alloc[i], buffer); + + return SLURM_SUCCESS; +} + /* * Pack a step's current gres status, called from slurmctld for save/restore * IN gres_list - generated by gres_plugin_step_allocate() @@ -2283,8 +2778,7 @@ extern int gres_plugin_step_state_pack(List gres_list, Buf buffer, size_offset = get_buf_offset(buffer); pack32(gres_size, buffer); /* placeholder */ data_offset = get_buf_offset(buffer); - rc2 = (*(gres_context[i].ops.step_state_pack)) - (gres_ptr->gres_data, buffer); + rc2 = _step_state_pack(gres_ptr->gres_data, buffer); if (rc2 != SLURM_SUCCESS) { rc = rc2; set_buf_offset(buffer, header_offset); @@ -2315,6 +2809,39 @@ extern int gres_plugin_step_state_pack(List gres_list, Buf buffer, return rc; } +static int _step_state_unpack(void **gres_data, Buf buffer, char *gres_name) +{ + int i; + gres_step_state_t *gres_ptr; + + gres_ptr = xmalloc(sizeof(gres_step_state_t)); + + if (buffer) { + safe_unpack32(&gres_ptr->gres_cnt_alloc, buffer); + safe_unpack8 (&gres_ptr->gres_cnt_mult, buffer); + + safe_unpack32(&gres_ptr->node_cnt, buffer); + gres_ptr->gres_bit_alloc = xmalloc(sizeof(bitstr_t *) * + gres_ptr->node_cnt); + for (i=0; i<gres_ptr->node_cnt; i++) + unpack_bit_str(&gres_ptr->gres_bit_alloc[i], buffer); + } + + *gres_data = gres_ptr; + return SLURM_SUCCESS; + +unpack_error: + error("Unpacking gres/%s step state info", gres_name); + if (gres_ptr->gres_bit_alloc) { + for (i=0; i<gres_ptr->node_cnt; i++) + FREE_NULL_BITMAP(gres_ptr->gres_bit_alloc[i]); + xfree(gres_ptr->gres_bit_alloc); + } + xfree(gres_ptr); + *gres_data = NULL; + return SLURM_ERROR; +} + /* * Unpack a step's current gres status, called from slurmctld for save/restore * OUT gres_list - restored state stored by gres_plugin_step_state_pack() @@ -2372,8 +2899,8 @@ extern int gres_plugin_step_state_unpack(List *gres_list, Buf buffer, continue; } gres_context[i].unpacked_info = true; - rc2 = (*(gres_context[i].ops.step_state_unpack)) - (&gres_data, buffer); + rc2 = _step_state_unpack(&gres_data, buffer, + gres_context[i].ops.gres_name); if (rc2 != SLURM_SUCCESS) { rc = rc2; } else { @@ -2393,8 +2920,8 @@ fini: /* Insure that every gres plugin is called for unpack, even if no data debug("gres_plugin_job_state_unpack: no info packed for %s " "by step %u.%u", gres_context[i].gres_type, job_id, step_id); - rc2 = (*(gres_context[i].ops.job_state_unpack)) - (&gres_data, NULL); + rc2 = _step_state_unpack(&gres_data, NULL, + gres_context[i].ops.gres_name); if (rc2 != SLURM_SUCCESS) { rc = rc2; } else { @@ -2415,6 +2942,33 @@ unpack_error: goto fini; } +static void _step_state_log(void *gres_data, uint32_t job_id, uint32_t step_id, + char *gres_name) +{ + gres_step_state_t *gres_ptr = (gres_step_state_t *) gres_data; + char *mult, tmp_str[128]; + int i; + + xassert(gres_ptr); + info("gres/%s state for step %u.%u", gres_name, job_id, step_id); + if (gres_ptr->gres_cnt_mult) + mult = "cpu"; + else + mult = "node"; + info(" gres_cnt:%u per %s node_cnt:%u", gres_ptr->gres_cnt_alloc, mult, + gres_ptr->node_cnt); + + if (gres_ptr->node_cnt && gres_ptr->gres_bit_alloc) { + for (i=0; i<gres_ptr->node_cnt; i++) { + bit_fmt(tmp_str, sizeof(tmp_str), + gres_ptr->gres_bit_alloc[i]); + info(" gres_bit_alloc[%d]:%s", i, tmp_str); + } + } else { + info(" gres_bit_alloc:NULL"); + } +} + /* * Log a step's current gres state * IN gres_list - generated by gres_plugin_step_allocate() @@ -2439,8 +2993,8 @@ extern void gres_plugin_step_state_log(List gres_list, uint32_t job_id, if (gres_ptr->plugin_id != *(gres_context[i].ops.plugin_id)) continue; - (*(gres_context[i].ops.step_state_log)) - (gres_ptr->gres_data, job_id, step_id); + _step_state_log(gres_ptr->gres_data, job_id, step_id, + gres_context[i].ops.gres_name); break; } } @@ -2493,10 +3047,10 @@ extern uint32_t gres_plugin_step_test(List step_gres_list, List job_gres_list, if (step_gres_ptr->plugin_id != *(gres_context[i].ops.plugin_id)) continue; - tmp_cnt = (*(gres_context[i].ops.step_test)) - (step_gres_ptr->gres_data, - job_gres_ptr->gres_data, - node_offset, ignore_alloc); + tmp_cnt = _step_test(step_gres_ptr->gres_data, + job_gres_ptr->gres_data, + node_offset, ignore_alloc, + gres_context[i].ops.gres_name); cpu_cnt = MIN(tmp_cnt, cpu_cnt); break; } @@ -2509,6 +3063,84 @@ extern uint32_t gres_plugin_step_test(List step_gres_list, List job_gres_list, return cpu_cnt; } +static int _step_alloc(void *step_gres_data, void *job_gres_data, + int node_offset, int cpu_cnt, char *gres_name) +{ + gres_job_state_t *job_gres_ptr = (gres_job_state_t *) job_gres_data; + gres_step_state_t *step_gres_ptr = (gres_step_state_t *) step_gres_data; + uint32_t gres_avail, gres_needed; + bitstr_t *gres_bit_alloc; + + xassert(job_gres_ptr); + xassert(step_gres_ptr); + if (node_offset >= job_gres_ptr->node_cnt) { + error("gres/%s: step_alloc node offset invalid (%d >= %u)", + gres_name, node_offset, job_gres_ptr->node_cnt); + return SLURM_ERROR; + } + if ((job_gres_ptr->gres_bit_alloc == NULL) || + (job_gres_ptr->gres_bit_alloc[node_offset] == NULL)) { + error("gres/%s: step_alloc gres_bit_alloc is NULL", gres_name); + return SLURM_ERROR; + } + + gres_bit_alloc = bit_copy(job_gres_ptr->gres_bit_alloc[node_offset]); + if (gres_bit_alloc == NULL) + fatal("bit_copy malloc failure"); + if (job_gres_ptr->gres_bit_step_alloc && + job_gres_ptr->gres_bit_step_alloc[node_offset]) { + bit_not(job_gres_ptr->gres_bit_step_alloc[node_offset]); + bit_and(gres_bit_alloc, + job_gres_ptr->gres_bit_step_alloc[node_offset]); + bit_not(job_gres_ptr->gres_bit_step_alloc[node_offset]); + } + gres_avail = bit_set_count(gres_bit_alloc); + gres_needed = step_gres_ptr->gres_cnt_alloc; + if (step_gres_ptr->gres_cnt_mult) + gres_needed *= cpu_cnt; + if (gres_needed > gres_avail) { + error("gres/%s: step oversubscribing resources on node %d", + gres_name, node_offset); + } else { + int gres_rem = gres_needed; + int i, len = bit_size(gres_bit_alloc); + for (i=0; i<len; i++) { + if (gres_rem > 0) { + if (bit_test(gres_bit_alloc, i)) + gres_rem--; + } else { + bit_clear(gres_bit_alloc, i); + } + } + } + + if (job_gres_ptr->gres_bit_step_alloc == NULL) { + job_gres_ptr->gres_bit_step_alloc = + xmalloc(sizeof(bitstr_t *) * job_gres_ptr->node_cnt); + } + if (job_gres_ptr->gres_bit_step_alloc[node_offset]) { + bit_or(job_gres_ptr->gres_bit_step_alloc[node_offset], + gres_bit_alloc); + } else { + job_gres_ptr->gres_bit_step_alloc[node_offset] = + bit_copy(gres_bit_alloc); + } + if (step_gres_ptr->gres_bit_alloc == NULL) { + step_gres_ptr->gres_bit_alloc = xmalloc(sizeof(bitstr_t *) * + job_gres_ptr->node_cnt); + step_gres_ptr->node_cnt = job_gres_ptr->node_cnt; + } + if (step_gres_ptr->gres_bit_alloc[node_offset]) { + error("gres/%s: step bit_alloc already exists", gres_name); + bit_or(step_gres_ptr->gres_bit_alloc[node_offset],gres_bit_alloc); + FREE_NULL_BITMAP(gres_bit_alloc); + } else { + step_gres_ptr->gres_bit_alloc[node_offset] = gres_bit_alloc; + } + + return SLURM_SUCCESS; +} + /* * Allocate resource to a step and update job and step gres information * IN step_gres_list - step's gres_list built by @@ -2549,10 +3181,10 @@ extern int gres_plugin_step_alloc(List step_gres_list, List job_gres_list, if (step_gres_ptr->plugin_id != *(gres_context[i].ops.plugin_id)) continue; - rc2 = (*(gres_context[i].ops.step_alloc)) - (step_gres_ptr->gres_data, - job_gres_ptr->gres_data, - node_offset, cpu_cnt); + rc2 = _step_alloc(step_gres_ptr->gres_data, + job_gres_ptr->gres_data, + node_offset, cpu_cnt, + gres_context[i].ops.gres_name); if (rc2 != SLURM_SUCCESS) rc = rc2; break; @@ -2564,6 +3196,59 @@ extern int gres_plugin_step_alloc(List step_gres_list, List job_gres_list, return rc; } + +static int _step_dealloc(void *step_gres_data, void *job_gres_data, + char *gres_name) +{ + + gres_job_state_t *job_gres_ptr = (gres_job_state_t *) job_gres_data; + gres_step_state_t *step_gres_ptr = (gres_step_state_t *) step_gres_data; + uint32_t i, j, node_cnt; + int len_j, len_s; + + xassert(job_gres_ptr); + xassert(step_gres_ptr); + node_cnt = MIN(job_gres_ptr->node_cnt, step_gres_ptr->node_cnt); + if (step_gres_ptr->gres_bit_alloc == NULL) { + error("gres/%s: step dealloc bit_alloc is NULL", gres_name); + return SLURM_ERROR; + } + if (job_gres_ptr->gres_bit_alloc == NULL) { + error("gres/%s: step dealloc, job's bit_alloc is NULL", + gres_name); + return SLURM_ERROR; + } + for (i=0; i<node_cnt; i++) { + if (step_gres_ptr->gres_bit_alloc[i] == NULL) + continue; + if (job_gres_ptr->gres_bit_alloc[i] == NULL) { + error("gres/%s: step dealloc, job's bit_alloc[%d] is " + "NULL", gres_name, i); + continue; + } + len_j = bit_size(job_gres_ptr->gres_bit_alloc[i]); + len_s = bit_size(step_gres_ptr->gres_bit_alloc[i]); + if (len_j != len_s) { + error("gres/%s: step dealloc, bit_alloc[%d] size " + "mis-match (%d != %d)", + gres_name, i, len_j, len_s); + len_j = MIN(len_j, len_s); + } + for (j=0; j<len_j; j++) { + if (!bit_test(step_gres_ptr->gres_bit_alloc[i], j)) + continue; + if (job_gres_ptr->gres_bit_step_alloc && + job_gres_ptr->gres_bit_step_alloc[i]) { + bit_clear(job_gres_ptr->gres_bit_step_alloc[i], + j); + } + } + FREE_NULL_BITMAP(step_gres_ptr->gres_bit_alloc[i]); + } + + return SLURM_SUCCESS; +} + /* * Deallocate resource to a step and update job and step gres information * IN step_gres_list - step's gres_list built by @@ -2601,9 +3286,9 @@ extern int gres_plugin_step_dealloc(List step_gres_list, List job_gres_list) if (step_gres_ptr->plugin_id != *(gres_context[i].ops.plugin_id)) continue; - rc2 = (*(gres_context[i].ops.step_dealloc)) - (step_gres_ptr->gres_data, - job_gres_ptr->gres_data); + rc2 = _step_dealloc(step_gres_ptr->gres_data, + job_gres_ptr->gres_data, + gres_context[i].ops.gres_name); if (rc2 != SLURM_SUCCESS) rc = rc2; break; diff --git a/src/common/gres.h b/src/common/gres.h index 0319651831c..7cef2c1a318 100644 --- a/src/common/gres.h +++ b/src/common/gres.h @@ -70,6 +70,38 @@ typedef struct gres_node_state { bitstr_t *gres_bit_alloc; } gres_node_state_t; +/* Gres job state as used by slurmctld daemon */ +typedef struct gres_job_state { + /* Count of resources needed */ + uint32_t gres_cnt_alloc; + + /* If 0 then gres_cnt_alloc is per node, + * if 1 then gres_cnt_alloc is per CPU */ + uint8_t gres_cnt_mult; + + /* Resources currently allocated to job on each node */ + uint32_t node_cnt; + bitstr_t **gres_bit_alloc; + + /* Resources currently allocated to job steps on each node. + * This will be a subset of resources allocated to the job. + * gres_bit_step_alloc is a subset of gres_bit_alloc */ + bitstr_t **gres_bit_step_alloc; +} gres_job_state_t; + +/* Gres job step state as used by slurmctld daemon */ +typedef struct gres_step_state { + /* Count of resources needed */ + uint32_t gres_cnt_alloc; + + /* If 0 then gres_cnt_alloc is per node, + * if 1 then gres_cnt_alloc is per CPU */ + uint8_t gres_cnt_mult; + + /* Resources currently allocated to the job step on each node */ + uint32_t node_cnt; + bitstr_t **gres_bit_alloc; +} gres_step_state_t; /* * Initialize the gres plugin. * diff --git a/src/plugins/gres/gpu/gres_gpu.c b/src/plugins/gres/gpu/gres_gpu.c index 0501993bed7..2d9844aa080 100644 --- a/src/plugins/gres/gpu/gres_gpu.c +++ b/src/plugins/gres/gpu/gres_gpu.c @@ -101,68 +101,13 @@ * plugin_id - unique id for this plugin, value of 100+ * help_msg - response for srun --gres=help * plugin_version - specifies the version number of the plugin. - * min_plug_version - specifies the minumum version number of incoming - * messages that this plugin can accept */ const char plugin_name[] = "Gres GPU plugin"; const char plugin_type[] = "gres/gpu"; const uint32_t plugin_id = 101; const char gres_name[] = "gpu"; const char help_msg[] = "gpu[:count[*cpu]]"; - const uint32_t plugin_version = 100; -const uint32_t min_plug_version = 100; - -/* Gres node state as used by slurmctld. Includes data from gres_config loaded - * from slurmd, resources configured (may be more or less than actually found) - * plus resource allocation information. */ -typedef struct gpu_node_state { - /* Actual hardware found */ - uint32_t gpu_cnt_found; - - /* Configured resources via Gres parameter */ - uint32_t gpu_cnt_config; - - /* Total resources available for allocation to jobs */ - uint32_t gpu_cnt_avail; - - /* Resources currently allocated to jobs */ - uint32_t gpu_cnt_alloc; - bitstr_t *gpu_bit_alloc; -} gpu_node_state_t; - -/* Gres job state as used by slurmctld. */ -typedef struct gpu_job_state { - /* Count of resources needed */ - uint32_t gpu_cnt_alloc; - - /* If 0 then gpu_cnt_alloc is per node, - * if 1 then gpu_cnt_alloc is per CPU */ - uint8_t gpu_cnt_mult; - - /* Resources currently allocated to job on each node */ - uint32_t node_cnt; - bitstr_t **gpu_bit_alloc; - - /* Resources currently allocated to job steps on each node. - * This will be a subset of resources allocated to the job. - * gpu_bit_step_alloc is a subset of gpu_bit_alloc */ - bitstr_t **gpu_bit_step_alloc; -} gpu_job_state_t; - -/* Gres job step state as used by slurmctld. */ -typedef struct gpu_step_state { - /* Count of resources needed */ - uint32_t gpu_cnt_alloc; - - /* If 0 then gpu_cnt_alloc is per node, - * if 1 then gpu_cnt_alloc is per CPU */ - uint8_t gpu_cnt_mult; - - /* Resources currently allocated to the job step on each node */ - uint32_t node_cnt; - bitstr_t **gpu_bit_alloc; -} gpu_step_state_t; /* * We could load gres state or validate it using various mechanisms here. @@ -190,753 +135,3 @@ extern int node_config_load(List gres_conf_list) fatal("%s failed to load configuration", plugin_name); return rc; } - -extern void *node_state_dup(void *gres_data) -{ - gpu_node_state_t *gres_ptr = (gpu_node_state_t *) gres_data; - gpu_node_state_t *new_gres; - - if (gres_ptr == NULL) - return NULL; - - new_gres = xmalloc(sizeof(gpu_node_state_t)); - new_gres->gpu_cnt_found = gres_ptr->gpu_cnt_found; - new_gres->gpu_cnt_config = gres_ptr->gpu_cnt_config; - new_gres->gpu_cnt_avail = gres_ptr->gpu_cnt_avail; - new_gres->gpu_cnt_alloc = gres_ptr->gpu_cnt_alloc; - new_gres->gpu_bit_alloc = bit_copy(gres_ptr->gpu_bit_alloc); - - return new_gres; -} - -extern void node_state_dealloc(void *gres_data) -{ - gpu_node_state_t *gres_ptr = (gpu_node_state_t *) gres_data; - - gres_ptr->gpu_cnt_alloc = 0; - if (gres_ptr->gpu_bit_alloc) { - int i = bit_size(gres_ptr->gpu_bit_alloc) - 1; - if (i > 0) - bit_nclear(gres_ptr->gpu_bit_alloc, 0, i); - } -} - -extern int node_state_realloc(void *job_gres_data, int node_offset, - void *node_gres_data) -{ - gpu_job_state_t *job_gres_ptr = (gpu_job_state_t *) job_gres_data; - gpu_node_state_t *node_gres_ptr = (gpu_node_state_t *) node_gres_data; - int i, job_bit_size, node_bit_size; - - xassert(job_gres_ptr); - xassert(node_gres_ptr); - - if (node_offset >= job_gres_ptr->node_cnt) { - error("%s job node offset is bad (%d >= %u)", - plugin_name, node_offset, job_gres_ptr->node_cnt); - return EINVAL; - } - - if ((job_gres_ptr->gpu_bit_alloc == NULL) || - (job_gres_ptr->gpu_bit_alloc[node_offset] == NULL)) { - error("%s job bit_alloc is NULL", plugin_name); - return EINVAL; - } - - if (node_gres_ptr->gpu_bit_alloc == NULL) { - error("%s node bit_alloc is NULL", plugin_name); - return EINVAL; - } - - job_bit_size = bit_size(job_gres_ptr->gpu_bit_alloc[node_offset]); - node_bit_size = bit_size(node_gres_ptr->gpu_bit_alloc); - if (job_bit_size > node_bit_size) { - error("%s job/node bit size mismatch (%d != %d)", - plugin_name, job_bit_size, node_bit_size); - /* Node needs to register with more resources, expand - * node's bitmap now so we can merge the data */ - node_gres_ptr->gpu_bit_alloc = - bit_realloc(node_gres_ptr->gpu_bit_alloc, - job_bit_size); - if (node_gres_ptr->gpu_bit_alloc == NULL) - fatal("bit_realloc: malloc failure"); - node_bit_size = job_bit_size; - } - if (job_bit_size < node_bit_size) { - error("%s job/node bit size mismatch (%d != %d)", - plugin_name, job_bit_size, node_bit_size); - /* Update what we can */ - node_bit_size = MIN(job_bit_size, node_bit_size); - for (i=0; i<node_bit_size; i++) { - if (!bit_test(job_gres_ptr->gpu_bit_alloc[node_offset], - i)) - continue; - node_gres_ptr->gpu_cnt_alloc++; - bit_set(node_gres_ptr->gpu_bit_alloc, i); - } - } else { - node_gres_ptr->gpu_cnt_alloc += bit_set_count(job_gres_ptr-> - gpu_bit_alloc - [node_offset]); - bit_or(node_gres_ptr->gpu_bit_alloc, - job_gres_ptr->gpu_bit_alloc[node_offset]); - } - - return SLURM_SUCCESS; -} - -extern void node_state_log(void *gres_data, char *node_name) -{ - gpu_node_state_t *gres_ptr; - - xassert(gres_data); - gres_ptr = (gpu_node_state_t *) gres_data; - info("%s state for %s", plugin_name, node_name); - info(" gpu_cnt found:%u configured:%u avail:%u alloc:%u", - gres_ptr->gpu_cnt_found, gres_ptr->gpu_cnt_config, - gres_ptr->gpu_cnt_avail, gres_ptr->gpu_cnt_alloc); - if (gres_ptr->gpu_bit_alloc) { - char tmp_str[128]; - bit_fmt(tmp_str, sizeof(tmp_str), gres_ptr->gpu_bit_alloc); - info(" gpu_bit_alloc:%s", tmp_str); - } else { - info(" gpu_bit_alloc:NULL"); - } -} - -extern int job_state_validate(char *config, void **gres_data) -{ - char *last = NULL; - gpu_job_state_t *gres_ptr; - uint32_t cnt; - uint8_t mult = 0; - - if (!strcmp(config, "gpu")) { - cnt = 1; - } else if (!strncmp(config, "gpu:", 4)) { - cnt = strtol(config+4, &last, 10); - if (last[0] == '\0') - ; - else if ((last[0] == 'k') || (last[0] == 'K')) - cnt *= 1024; - else if (!strcasecmp(last, "*cpu")) - mult = 1; - else - return SLURM_ERROR; - if (cnt == 0) - return SLURM_ERROR; - } else - return SLURM_ERROR; - - gres_ptr = xmalloc(sizeof(gpu_job_state_t)); - gres_ptr->gpu_cnt_alloc = cnt; - gres_ptr->gpu_cnt_mult = mult; - *gres_data = gres_ptr; - return SLURM_SUCCESS; -} - -extern void job_state_delete(void *gres_data) -{ - int i; - gpu_job_state_t *gres_ptr = (gpu_job_state_t *) gres_data; - - if (gres_ptr == NULL) - return; - - if (gres_ptr->gpu_bit_alloc) { - for (i=0; i<gres_ptr->node_cnt; i++) - FREE_NULL_BITMAP(gres_ptr->gpu_bit_alloc[i]); - xfree(gres_ptr->gpu_bit_alloc); - } - if (gres_ptr->gpu_bit_step_alloc) { - for (i=0; i<gres_ptr->node_cnt; i++) - FREE_NULL_BITMAP(gres_ptr->gpu_bit_step_alloc[i]); - xfree(gres_ptr->gpu_bit_step_alloc); - } - xfree(gres_ptr); -} - -extern void *job_state_dup(void *gres_data) -{ - - int i; - gpu_job_state_t *gres_ptr = (gpu_job_state_t *) gres_data; - gpu_job_state_t *new_gres_ptr; - - if (gres_ptr == NULL) - return NULL; - - new_gres_ptr = xmalloc(sizeof(gpu_job_state_t)); - new_gres_ptr->gpu_cnt_alloc = gres_ptr->gpu_cnt_alloc; - new_gres_ptr->gpu_cnt_mult = gres_ptr->gpu_cnt_mult; - new_gres_ptr->node_cnt = gres_ptr->node_cnt; - new_gres_ptr->gpu_bit_alloc = xmalloc(sizeof(bitstr_t *) * - gres_ptr->node_cnt); - for (i=0; i<gres_ptr->node_cnt; i++) { - if (gres_ptr->gpu_bit_alloc[i] == NULL) - continue; - new_gres_ptr->gpu_bit_alloc[i] = bit_copy(gres_ptr-> - gpu_bit_alloc[i]); - } - return new_gres_ptr; -} - -extern int job_state_pack(void *gres_data, Buf buffer) -{ - int i; - gpu_job_state_t *gres_ptr = (gpu_job_state_t *) gres_data; - - pack32(gres_ptr->gpu_cnt_alloc, buffer); - pack8 (gres_ptr->gpu_cnt_mult, buffer); - - pack32(gres_ptr->node_cnt, buffer); - for (i=0; i<gres_ptr->node_cnt; i++) - pack_bit_str(gres_ptr->gpu_bit_alloc[i], buffer); - - return SLURM_SUCCESS; -} - -extern int job_state_unpack(void **gres_data, Buf buffer) -{ - int i; - gpu_job_state_t *gres_ptr; - - gres_ptr = xmalloc(sizeof(gpu_job_state_t)); - - if (buffer) { - safe_unpack32(&gres_ptr->gpu_cnt_alloc, buffer); - safe_unpack8 (&gres_ptr->gpu_cnt_mult, buffer); - - safe_unpack32(&gres_ptr->node_cnt, buffer); - gres_ptr->gpu_bit_alloc = xmalloc(sizeof(bitstr_t *) * - gres_ptr->node_cnt); - for (i=0; i<gres_ptr->node_cnt; i++) - unpack_bit_str(&gres_ptr->gpu_bit_alloc[i], buffer); - } - - *gres_data = gres_ptr; - return SLURM_SUCCESS; - -unpack_error: - error("Unpacking %s job state info", plugin_name); - if (gres_ptr->gpu_bit_alloc) { - for (i=0; i<gres_ptr->node_cnt; i++) - FREE_NULL_BITMAP(gres_ptr->gpu_bit_alloc[i]); - xfree(gres_ptr->gpu_bit_alloc); - } - xfree(gres_ptr); - *gres_data = NULL; - return SLURM_ERROR; -} - -extern uint32_t job_test(void *job_gres_data, void *node_gres_data, - bool use_total_gres) -{ - uint32_t gres_avail; - gpu_job_state_t *job_gres_ptr = (gpu_job_state_t *) job_gres_data; - gpu_node_state_t *node_gres_ptr = (gpu_node_state_t *) node_gres_data; - - gres_avail = node_gres_ptr->gpu_cnt_avail; - if (!use_total_gres) - gres_avail -= node_gres_ptr->gpu_cnt_alloc; - - if (job_gres_ptr->gpu_cnt_mult == 0) { - /* per node gres limit */ - if (job_gres_ptr->gpu_cnt_alloc > gres_avail) - return (uint32_t) 0; - return NO_VAL; - } else { - /* per CPU gres limit */ - return (uint32_t) (gres_avail / job_gres_ptr->gpu_cnt_alloc); - } -} - -extern int job_alloc(void *job_gres_data, void *node_gres_data, - int node_cnt, int node_offset, uint32_t cpu_cnt) -{ - int i; - uint32_t gres_cnt; - gpu_job_state_t *job_gres_ptr = (gpu_job_state_t *) job_gres_data; - gpu_node_state_t *node_gres_ptr = (gpu_node_state_t *) node_gres_data; - - /* - * Validate data structures. Either job_gres_data->node_cnt and - * job_gres_data->gpu_bit_alloc are both set or both zero/NULL. - */ - xassert(node_cnt); - xassert(node_offset >= 0); - xassert(job_gres_ptr); - xassert(node_gres_ptr); - xassert(node_gres_ptr->gpu_bit_alloc); - if (job_gres_ptr->node_cnt == 0) { - job_gres_ptr->node_cnt = node_cnt; - if (job_gres_ptr->gpu_bit_alloc) { - error("%s: node_cnt==0 and bit_alloc is set", - plugin_name); - xfree(job_gres_ptr->gpu_bit_alloc); - } - job_gres_ptr->gpu_bit_alloc = xmalloc(sizeof(bitstr_t *) * - node_cnt); - } else if (job_gres_ptr->node_cnt < node_cnt) { - error("%s: node_cnt increase from %u to %d", - plugin_name, job_gres_ptr->node_cnt, node_cnt); - if (node_offset >= job_gres_ptr->node_cnt) - return SLURM_ERROR; - } else if (job_gres_ptr->node_cnt > node_cnt) { - error("%s: node_cnt decrease from %u to %d", - plugin_name, job_gres_ptr->node_cnt, node_cnt); - } - - /* - * Check that sufficient resources exist on this node - */ - if (job_gres_ptr->gpu_cnt_mult == 0) - gres_cnt = job_gres_ptr->gpu_cnt_alloc; - else - gres_cnt = (job_gres_ptr->gpu_cnt_alloc * cpu_cnt); - i = node_gres_ptr->gpu_cnt_alloc + gres_cnt; - i -= node_gres_ptr->gpu_cnt_avail; - if (i > 0) { - error("%s: overallocated resources by %d", plugin_name, i); - /* proceed with request, give job what's available */ - } - - /* - * Select the specific resources to use for this job. - * We'll need to add topology information in the future - */ - if (job_gres_ptr->gpu_bit_alloc[node_offset]) { - /* Resuming a suspended job, resources already allocated */ - debug("%s: job's bit_alloc is already set for node %d", - plugin_name, node_offset); - gres_cnt = MIN(bit_size(node_gres_ptr->gpu_bit_alloc), - bit_size(job_gres_ptr-> - gpu_bit_alloc[node_offset])); - for (i=0; i<gres_cnt; i++) { - if (bit_test(job_gres_ptr->gpu_bit_alloc[node_offset], - i)) { - bit_set(node_gres_ptr->gpu_bit_alloc, i); - node_gres_ptr->gpu_cnt_alloc++; - } - } - } else { - job_gres_ptr->gpu_bit_alloc[node_offset] = - bit_alloc(node_gres_ptr->gpu_cnt_avail); - if (job_gres_ptr->gpu_bit_alloc[node_offset] == NULL) - fatal("bit_copy: malloc failure"); - for (i=0; i<node_gres_ptr->gpu_cnt_avail && gres_cnt>0; i++) { - if (bit_test(node_gres_ptr->gpu_bit_alloc, i)) - continue; - bit_set(node_gres_ptr->gpu_bit_alloc, i); - bit_set(job_gres_ptr->gpu_bit_alloc[node_offset], i); - node_gres_ptr->gpu_cnt_alloc++; - gres_cnt--; - } - } - - return SLURM_SUCCESS; -} - -extern int job_dealloc(void *job_gres_data, void *node_gres_data, - int node_offset) -{ - int i, len; - gpu_job_state_t *job_gres_ptr = (gpu_job_state_t *) job_gres_data; - gpu_node_state_t *node_gres_ptr = (gpu_node_state_t *) node_gres_data; - - /* - * Validate data structures. Either job_gres_data->node_cnt and - * job_gres_data->gpu_bit_alloc are both set or both zero/NULL. - */ - xassert(node_offset >= 0); - xassert(job_gres_ptr); - xassert(node_gres_ptr); - xassert(node_gres_ptr->gpu_bit_alloc); - if (job_gres_ptr->node_cnt <= node_offset) { - error("%s: bad node_offset %d count is %u", - plugin_name, node_offset, job_gres_ptr->node_cnt); - return SLURM_ERROR; - } - if (job_gres_ptr->gpu_bit_alloc == NULL) { - error("%s: job's bitmap is NULL", plugin_name); - return SLURM_ERROR; - } - if (job_gres_ptr->gpu_bit_alloc[node_offset] == NULL) { - error("%s: job's bitmap is empty", plugin_name); - return SLURM_ERROR; - } - - len = bit_size(job_gres_ptr->gpu_bit_alloc[node_offset]); - i = bit_size(node_gres_ptr->gpu_bit_alloc); - if (i != len) { - error("%s: job and node bitmap sizes differ (%d != %d)", - plugin_name, len, i); - len = MIN(len, i); - /* proceed with request, make best effort */ - } - for (i=0; i<len; i++) { - if (!bit_test(job_gres_ptr->gpu_bit_alloc[node_offset], i)) - continue; - bit_clear(node_gres_ptr->gpu_bit_alloc, i); - /* NOTE: Do not clear bit from - * job_gres_ptr->gpu_bit_alloc[node_offset] - * since this may only be an emulated deallocate */ - node_gres_ptr->gpu_cnt_alloc--; - } - - return SLURM_SUCCESS; -} - -extern void job_state_log(void *gres_data, uint32_t job_id) -{ - gpu_job_state_t *gres_ptr; - char *mult, tmp_str[128]; - int i; - - xassert(gres_data); - gres_ptr = (gpu_job_state_t *) gres_data; - info("%s state for job %u", plugin_name, job_id); - if (gres_ptr->gpu_cnt_mult) - mult = "cpu"; - else - mult = "node"; - info(" gpu_cnt:%u per %s node_cnt:%u", gres_ptr->gpu_cnt_alloc, mult, - gres_ptr->node_cnt); - - if (gres_ptr->node_cnt && gres_ptr->gpu_bit_alloc) { - for (i=0; i<gres_ptr->node_cnt; i++) { - bit_fmt(tmp_str, sizeof(tmp_str), - gres_ptr->gpu_bit_alloc[i]); - info(" gpu_bit_alloc[%d]:%s", i, tmp_str); - } - } else { - info(" gpu_bit_alloc:NULL"); - } - - if (gres_ptr->node_cnt && gres_ptr->gpu_bit_step_alloc) { - for (i=0; i<gres_ptr->node_cnt; i++) { - bit_fmt(tmp_str, sizeof(tmp_str), - gres_ptr->gpu_bit_step_alloc[i]); - info(" gpu_bit_step_alloc[%d]:%s", i, tmp_str); - } - } else { - info(" gpu_bit_step_alloc:NULL"); - } -} - -extern void step_state_delete(void *gres_data) -{ - int i; - gpu_step_state_t *gres_ptr = (gpu_step_state_t *) gres_data; - - if (gres_ptr == NULL) - return; - - if (gres_ptr->gpu_bit_alloc) { - for (i=0; i<gres_ptr->node_cnt; i++) - FREE_NULL_BITMAP(gres_ptr->gpu_bit_alloc[i]); - xfree(gres_ptr->gpu_bit_alloc); - } - xfree(gres_ptr); -} - -extern int step_state_validate(char *config, void **gres_data) -{ - char *last = NULL; - gpu_job_state_t *gres_ptr; - uint32_t cnt; - uint8_t mult = 0; - - if (!strcmp(config, "gpu")) { - cnt = 1; - } else if (!strncmp(config, "gpu:", 4)) { - cnt = strtol(config+4, &last, 10); - if (last[0] == '\0') - ; - else if ((last[0] == 'k') || (last[0] == 'K')) - cnt *= 1024; - else if (!strcasecmp(last, "*cpu")) - mult = 1; - else - return SLURM_ERROR; - if (cnt == 0) - return SLURM_ERROR; - } else - return SLURM_ERROR; - - gres_ptr = xmalloc(sizeof(gpu_step_state_t)); - gres_ptr->gpu_cnt_alloc = cnt; - gres_ptr->gpu_cnt_mult = mult; - *gres_data = gres_ptr; - return SLURM_SUCCESS; -} - -extern void *step_state_dup(void *gres_data) -{ - - int i; - gpu_step_state_t *gres_ptr = (gpu_step_state_t *) gres_data; - gpu_step_state_t *new_gres_ptr; - - if (gres_ptr == NULL) - return NULL; - - new_gres_ptr = xmalloc(sizeof(gpu_step_state_t)); - new_gres_ptr->gpu_cnt_alloc = gres_ptr->gpu_cnt_alloc; - new_gres_ptr->gpu_cnt_mult = gres_ptr->gpu_cnt_mult; - new_gres_ptr->node_cnt = gres_ptr->node_cnt; - new_gres_ptr->gpu_bit_alloc = xmalloc(sizeof(bitstr_t *) * - gres_ptr->node_cnt); - for (i=0; i<gres_ptr->node_cnt; i++) { - if (gres_ptr->gpu_bit_alloc[i] == NULL) - continue; - new_gres_ptr->gpu_bit_alloc[i] = bit_copy(gres_ptr-> - gpu_bit_alloc[i]); - } - return new_gres_ptr; -} - -extern int step_state_pack(void *gres_data, Buf buffer) -{ - int i; - gpu_step_state_t *gres_ptr = (gpu_step_state_t *) gres_data; - - pack32(gres_ptr->gpu_cnt_alloc, buffer); - pack8 (gres_ptr->gpu_cnt_mult, buffer); - - pack32(gres_ptr->node_cnt, buffer); - for (i=0; i<gres_ptr->node_cnt; i++) - pack_bit_str(gres_ptr->gpu_bit_alloc[i], buffer); - - return SLURM_SUCCESS; -} - -extern int step_state_unpack(void **gres_data, Buf buffer) -{ - int i; - gpu_step_state_t *gres_ptr; - - gres_ptr = xmalloc(sizeof(gpu_step_state_t)); - - if (buffer) { - safe_unpack32(&gres_ptr->gpu_cnt_alloc, buffer); - safe_unpack8 (&gres_ptr->gpu_cnt_mult, buffer); - - safe_unpack32(&gres_ptr->node_cnt, buffer); - gres_ptr->gpu_bit_alloc = xmalloc(sizeof(bitstr_t *) * - gres_ptr->node_cnt); - for (i=0; i<gres_ptr->node_cnt; i++) - unpack_bit_str(&gres_ptr->gpu_bit_alloc[i], buffer); - } - - *gres_data = gres_ptr; - return SLURM_SUCCESS; - -unpack_error: - error("Unpacking %s step state info", plugin_name); - if (gres_ptr->gpu_bit_alloc) { - for (i=0; i<gres_ptr->node_cnt; i++) - FREE_NULL_BITMAP(gres_ptr->gpu_bit_alloc[i]); - xfree(gres_ptr->gpu_bit_alloc); - } - xfree(gres_ptr); - *gres_data = NULL; - return SLURM_ERROR; -} - -extern void step_state_log(void *gres_data, uint32_t job_id, uint32_t step_id) -{ - gpu_step_state_t *gres_ptr = (gpu_step_state_t *) gres_data; - char *mult, tmp_str[128]; - int i; - - xassert(gres_ptr); - info("%s state for step %u.%u", plugin_name, job_id, step_id); - if (gres_ptr->gpu_cnt_mult) - mult = "cpu"; - else - mult = "node"; - info(" gpu_cnt:%u per %s node_cnt:%u", gres_ptr->gpu_cnt_alloc, mult, - gres_ptr->node_cnt); - - if (gres_ptr->node_cnt && gres_ptr->gpu_bit_alloc) { - for (i=0; i<gres_ptr->node_cnt; i++) { - bit_fmt(tmp_str, sizeof(tmp_str), - gres_ptr->gpu_bit_alloc[i]); - info(" gpu_bit_alloc[%d]:%s", i, tmp_str); - } - } else { - info(" gpu_bit_alloc:NULL"); - } -} - -extern uint32_t step_test(void *step_gres_data, void *job_gres_data, - int node_offset, bool ignore_alloc) -{ - gpu_job_state_t *job_gres_ptr = (gpu_job_state_t *) job_gres_data; - gpu_step_state_t *step_gres_ptr = (gpu_step_state_t *) step_gres_data; - uint32_t gres_cnt; - - xassert(job_gres_ptr); - xassert(step_gres_ptr); - - if (node_offset == NO_VAL) { - if (step_gres_ptr->gpu_cnt_alloc > job_gres_ptr->gpu_cnt_alloc) - return 0; - return NO_VAL; - } - - if (node_offset >= job_gres_ptr->node_cnt) { - error("%s step_test node offset invalid (%d >= %u)", - plugin_name, node_offset, job_gres_ptr->node_cnt); - return 0; - } - if ((job_gres_ptr->gpu_bit_alloc == NULL) || - (job_gres_ptr->gpu_bit_alloc[node_offset] == NULL)) { - error("%s step_test gpu_bit_alloc is NULL", plugin_name); - return 0; - } - - gres_cnt = bit_set_count(job_gres_ptr->gpu_bit_alloc[node_offset]); - if (!ignore_alloc && - job_gres_ptr->gpu_bit_step_alloc && - job_gres_ptr->gpu_bit_step_alloc[node_offset]) { - gres_cnt -= bit_set_count(job_gres_ptr-> - gpu_bit_step_alloc[node_offset]); - } - if (step_gres_ptr->gpu_cnt_mult) /* Gres count per CPU */ - gres_cnt /= step_gres_ptr->gpu_cnt_alloc; - else if (step_gres_ptr->gpu_cnt_alloc > gres_cnt) - gres_cnt = 0; - else - gres_cnt = NO_VAL; - - return gres_cnt; -} - -extern int step_alloc(void *step_gres_data, void *job_gres_data, - int node_offset, int cpu_cnt) -{ - gpu_job_state_t *job_gres_ptr = (gpu_job_state_t *) job_gres_data; - gpu_step_state_t *step_gres_ptr = (gpu_step_state_t *) step_gres_data; - uint32_t gres_avail, gres_needed; - bitstr_t *gpu_bit_alloc; - - xassert(job_gres_ptr); - xassert(step_gres_ptr); - if (node_offset >= job_gres_ptr->node_cnt) { - error("%s step_alloc node offset invalid (%d >= %u)", - plugin_name, node_offset, job_gres_ptr->node_cnt); - return SLURM_ERROR; - } - if ((job_gres_ptr->gpu_bit_alloc == NULL) || - (job_gres_ptr->gpu_bit_alloc[node_offset] == NULL)) { - error("%s step_alloc gpu_bit_alloc is NULL", plugin_name); - return SLURM_ERROR; - } - - gpu_bit_alloc = bit_copy(job_gres_ptr->gpu_bit_alloc[node_offset]); - if (gpu_bit_alloc == NULL) - fatal("bit_copy malloc failure"); - if (job_gres_ptr->gpu_bit_step_alloc && - job_gres_ptr->gpu_bit_step_alloc[node_offset]) { - bit_not(job_gres_ptr->gpu_bit_step_alloc[node_offset]); - bit_and(gpu_bit_alloc, - job_gres_ptr->gpu_bit_step_alloc[node_offset]); - bit_not(job_gres_ptr->gpu_bit_step_alloc[node_offset]); - } - gres_avail = bit_set_count(gpu_bit_alloc); - gres_needed = step_gres_ptr->gpu_cnt_alloc; - if (step_gres_ptr->gpu_cnt_mult) - gres_needed *= cpu_cnt; - if (gres_needed > gres_avail) { - error("%s step oversubscribing resources on node %d", - plugin_name, node_offset); - } else { - int gres_rem = gres_needed; - int i, len = bit_size(gpu_bit_alloc); - for (i=0; i<len; i++) { - if (gres_rem > 0) { - if (bit_test(gpu_bit_alloc, i)) - gres_rem--; - } else { - bit_clear(gpu_bit_alloc, i); - } - } - } - - if (job_gres_ptr->gpu_bit_step_alloc == NULL) { - job_gres_ptr->gpu_bit_step_alloc = - xmalloc(sizeof(bitstr_t *) * job_gres_ptr->node_cnt); - } - if (job_gres_ptr->gpu_bit_step_alloc[node_offset]) { - bit_or(job_gres_ptr->gpu_bit_step_alloc[node_offset], - gpu_bit_alloc); - } else { - job_gres_ptr->gpu_bit_step_alloc[node_offset] = - bit_copy(gpu_bit_alloc); - } - if (step_gres_ptr->gpu_bit_alloc == NULL) { - step_gres_ptr->gpu_bit_alloc = xmalloc(sizeof(bitstr_t *) * - job_gres_ptr->node_cnt); - step_gres_ptr->node_cnt = job_gres_ptr->node_cnt; - } - if (step_gres_ptr->gpu_bit_alloc[node_offset]) { - error("%s step bit_alloc already exists", plugin_name); - bit_or(step_gres_ptr->gpu_bit_alloc[node_offset],gpu_bit_alloc); - FREE_NULL_BITMAP(gpu_bit_alloc); - } else { - step_gres_ptr->gpu_bit_alloc[node_offset] = gpu_bit_alloc; - } - - return SLURM_SUCCESS; -} - -extern int step_dealloc(void *step_gres_data, void *job_gres_data) -{ - - gpu_job_state_t *job_gres_ptr = (gpu_job_state_t *) job_gres_data; - gpu_step_state_t *step_gres_ptr = (gpu_step_state_t *) step_gres_data; - uint32_t i, j, node_cnt; - int len_j, len_s; - - xassert(job_gres_ptr); - xassert(step_gres_ptr); - node_cnt = MIN(job_gres_ptr->node_cnt, step_gres_ptr->node_cnt); - if (step_gres_ptr->gpu_bit_alloc == NULL) { - error("%s step dealloc bit_alloc is NULL", plugin_name); - return SLURM_ERROR; - } - if (job_gres_ptr->gpu_bit_alloc == NULL) { - error("%s step dealloc, job's bit_alloc is NULL", plugin_name); - return SLURM_ERROR; - } - for (i=0; i<node_cnt; i++) { - if (step_gres_ptr->gpu_bit_alloc[i] == NULL) - continue; - if (job_gres_ptr->gpu_bit_alloc[i] == NULL) { - error("%s step dealloc, job's bit_alloc[%d] is NULL", - plugin_name, i); - continue; - } - len_j = bit_size(job_gres_ptr->gpu_bit_alloc[i]); - len_s = bit_size(step_gres_ptr->gpu_bit_alloc[i]); - if (len_j != len_s) { - error("%s step dealloc, bit_alloc[%d] size mis-match" - "(%d != %d)", len_j, len_s); - len_j = MIN(len_j, len_s); - } - for (j=0; j<len_j; j++) { - if (!bit_test(step_gres_ptr->gpu_bit_alloc[i], j)) - continue; - if (job_gres_ptr->gpu_bit_step_alloc && - job_gres_ptr->gpu_bit_step_alloc[i]) { - bit_clear(job_gres_ptr->gpu_bit_step_alloc[i], - j); - } - } - FREE_NULL_BITMAP(step_gres_ptr->gpu_bit_alloc[i]); - } - - return SLURM_SUCCESS; -} diff --git a/src/plugins/gres/nic/gres_nic.c b/src/plugins/gres/nic/gres_nic.c index 8e7e6a897fa..244ab2a07a4 100644 --- a/src/plugins/gres/nic/gres_nic.c +++ b/src/plugins/gres/nic/gres_nic.c @@ -101,68 +101,13 @@ * plugin_id - unique id for this plugin, value of 100+ * help_msg - response for srun --gres=help * plugin_version - specifies the version number of the plugin. - * min_plug_version - specifies the minumum version number of incoming - * messages that this plugin can accept */ const char plugin_name[] = "Gres NIC plugin"; const char plugin_type[] = "gres/nic"; const uint32_t plugin_id = 102; const char gres_name[] = "nic"; const char help_msg[] = "nic[:count[*cpu]]"; - const uint32_t plugin_version = 100; -const uint32_t min_plug_version = 100; - -/* Gres node state as used by slurmctld. Includes data from gres_config loaded - * from slurmd, resources configured (may be more or less than actually found) - * plus resource allocation information. */ -typedef struct nic_node_state { - /* Actual hardware found */ - uint32_t nic_cnt_found; - - /* Configured resources via Gres parameter */ - uint32_t nic_cnt_config; - - /* Total resources available for allocation to jobs */ - uint32_t nic_cnt_avail; - - /* Resources currently allocated to jobs */ - uint32_t nic_cnt_alloc; - bitstr_t *nic_bit_alloc; -} nic_node_state_t; - -/* Gres job state as used by slurmctld. */ -typedef struct nic_job_state { - /* Count of resources needed */ - uint32_t nic_cnt_alloc; - - /* If 0 then nic_cnt_alloc is per node, - * if 1 then nic_cnt_alloc is per CPU */ - uint8_t nic_cnt_mult; - - /* Resources currently allocated to job on each node */ - uint32_t node_cnt; - bitstr_t **nic_bit_alloc; - - /* Resources currently allocated to job steps on each node. - * This will be a subset of resources allocated to the job. - * nic_bit_step_alloc is a subset of nic_bit_alloc */ - bitstr_t **nic_bit_step_alloc; -} nic_job_state_t; - -/* Gres job step state as used by slurmctld. */ -typedef struct nic_step_state { - /* Count of resources needed */ - uint32_t nic_cnt_alloc; - - /* If 0 then nic_cnt_alloc is per node, - * if 1 then nic_cnt_alloc is per CPU */ - uint8_t nic_cnt_mult; - - /* Resources currently allocated to the job step on each node */ - uint32_t node_cnt; - bitstr_t **nic_bit_alloc; -} nic_step_state_t; /* * We could load gres state or validate it using various mechanisms here. @@ -190,753 +135,3 @@ extern int node_config_load(List gres_conf_list) fatal("%s failed to load configuration", plugin_name); return rc; } - -extern void *node_state_dup(void *gres_data) -{ - nic_node_state_t *gres_ptr = (nic_node_state_t *) gres_data; - nic_node_state_t *new_gres; - - if (gres_ptr == NULL) - return NULL; - - new_gres = xmalloc(sizeof(nic_node_state_t)); - new_gres->nic_cnt_found = gres_ptr->nic_cnt_found; - new_gres->nic_cnt_config = gres_ptr->nic_cnt_config; - new_gres->nic_cnt_avail = gres_ptr->nic_cnt_avail; - new_gres->nic_cnt_alloc = gres_ptr->nic_cnt_alloc; - new_gres->nic_bit_alloc = bit_copy(gres_ptr->nic_bit_alloc); - - return new_gres; -} - -extern void node_state_dealloc(void *gres_data) -{ - nic_node_state_t *gres_ptr = (nic_node_state_t *) gres_data; - - gres_ptr->nic_cnt_alloc = 0; - if (gres_ptr->nic_bit_alloc) { - int i = bit_size(gres_ptr->nic_bit_alloc) - 1; - if (i > 0) - bit_nclear(gres_ptr->nic_bit_alloc, 0, i); - } -} - -extern int node_state_realloc(void *job_gres_data, int node_offset, - void *node_gres_data) -{ - nic_job_state_t *job_gres_ptr = (nic_job_state_t *) job_gres_data; - nic_node_state_t *node_gres_ptr = (nic_node_state_t *) node_gres_data; - int i, job_bit_size, node_bit_size; - - xassert(job_gres_ptr); - xassert(node_gres_ptr); - - if (node_offset >= job_gres_ptr->node_cnt) { - error("%s job node offset is bad (%d >= %u)", - plugin_name, node_offset, job_gres_ptr->node_cnt); - return EINVAL; - } - - if ((job_gres_ptr->nic_bit_alloc == NULL) || - (job_gres_ptr->nic_bit_alloc[node_offset] == NULL)) { - error("%s job bit_alloc is NULL", plugin_name); - return EINVAL; - } - - if (node_gres_ptr->nic_bit_alloc == NULL) { - error("%s node bit_alloc is NULL", plugin_name); - return EINVAL; - } - - job_bit_size = bit_size(job_gres_ptr->nic_bit_alloc[node_offset]); - node_bit_size = bit_size(node_gres_ptr->nic_bit_alloc); - if (job_bit_size > node_bit_size) { - error("%s job/node bit size mismatch (%d != %d)", - plugin_name, job_bit_size, node_bit_size); - /* Node needs to register with more resources, expand - * node's bitmap now so we can merge the data */ - node_gres_ptr->nic_bit_alloc = - bit_realloc(node_gres_ptr->nic_bit_alloc, - job_bit_size); - if (node_gres_ptr->nic_bit_alloc == NULL) - fatal("bit_realloc: malloc failure"); - node_bit_size = job_bit_size; - } - if (job_bit_size < node_bit_size) { - error("%s job/node bit size mismatch (%d != %d)", - plugin_name, job_bit_size, node_bit_size); - /* Update what we can */ - node_bit_size = MIN(job_bit_size, node_bit_size); - for (i=0; i<node_bit_size; i++) { - if (!bit_test(job_gres_ptr->nic_bit_alloc[node_offset], - i)) - continue; - node_gres_ptr->nic_cnt_alloc++; - bit_set(node_gres_ptr->nic_bit_alloc, i); - } - } else { - node_gres_ptr->nic_cnt_alloc += bit_set_count(job_gres_ptr-> - nic_bit_alloc - [node_offset]); - bit_or(node_gres_ptr->nic_bit_alloc, - job_gres_ptr->nic_bit_alloc[node_offset]); - } - - return SLURM_SUCCESS; -} - -extern void node_state_log(void *gres_data, char *node_name) -{ - nic_node_state_t *gres_ptr; - - xassert(gres_data); - gres_ptr = (nic_node_state_t *) gres_data; - info("%s state for %s", plugin_name, node_name); - info(" nic_cnt found:%u configured:%u avail:%u alloc:%u", - gres_ptr->nic_cnt_found, gres_ptr->nic_cnt_config, - gres_ptr->nic_cnt_avail, gres_ptr->nic_cnt_alloc); - if (gres_ptr->nic_bit_alloc) { - char tmp_str[128]; - bit_fmt(tmp_str, sizeof(tmp_str), gres_ptr->nic_bit_alloc); - info(" nic_bit_alloc:%s", tmp_str); - } else { - info(" nic_bit_alloc:NULL"); - } -} - -extern int job_state_validate(char *config, void **gres_data) -{ - char *last = NULL; - nic_job_state_t *gres_ptr; - uint32_t cnt; - uint8_t mult = 0; - - if (!strcmp(config, "nic")) { - cnt = 1; - } else if (!strncmp(config, "nic:", 4)) { - cnt = strtol(config+4, &last, 10); - if (last[0] == '\0') - ; - else if ((last[0] == 'k') || (last[0] == 'K')) - cnt *= 1024; - else if (!strcasecmp(last, "*cpu")) - mult = 1; - else - return SLURM_ERROR; - if (cnt == 0) - return SLURM_ERROR; - } else - return SLURM_ERROR; - - gres_ptr = xmalloc(sizeof(nic_job_state_t)); - gres_ptr->nic_cnt_alloc = cnt; - gres_ptr->nic_cnt_mult = mult; - *gres_data = gres_ptr; - return SLURM_SUCCESS; -} - -extern void job_state_delete(void *gres_data) -{ - int i; - nic_job_state_t *gres_ptr = (nic_job_state_t *) gres_data; - - if (gres_ptr == NULL) - return; - - if (gres_ptr->nic_bit_alloc) { - for (i=0; i<gres_ptr->node_cnt; i++) - FREE_NULL_BITMAP(gres_ptr->nic_bit_alloc[i]); - xfree(gres_ptr->nic_bit_alloc); - } - if (gres_ptr->nic_bit_step_alloc) { - for (i=0; i<gres_ptr->node_cnt; i++) - FREE_NULL_BITMAP(gres_ptr->nic_bit_step_alloc[i]); - xfree(gres_ptr->nic_bit_step_alloc); - } - xfree(gres_ptr); -} - -extern void *job_state_dup(void *gres_data) -{ - - int i; - nic_job_state_t *gres_ptr = (nic_job_state_t *) gres_data; - nic_job_state_t *new_gres_ptr; - - if (gres_ptr == NULL) - return NULL; - - new_gres_ptr = xmalloc(sizeof(nic_job_state_t)); - new_gres_ptr->nic_cnt_alloc = gres_ptr->nic_cnt_alloc; - new_gres_ptr->nic_cnt_mult = gres_ptr->nic_cnt_mult; - new_gres_ptr->node_cnt = gres_ptr->node_cnt; - new_gres_ptr->nic_bit_alloc = xmalloc(sizeof(bitstr_t *) * - gres_ptr->node_cnt); - for (i=0; i<gres_ptr->node_cnt; i++) { - if (gres_ptr->nic_bit_alloc[i] == NULL) - continue; - new_gres_ptr->nic_bit_alloc[i] = bit_copy(gres_ptr-> - nic_bit_alloc[i]); - } - return new_gres_ptr; -} - -extern int job_state_pack(void *gres_data, Buf buffer) -{ - int i; - nic_job_state_t *gres_ptr = (nic_job_state_t *) gres_data; - - pack32(gres_ptr->nic_cnt_alloc, buffer); - pack8 (gres_ptr->nic_cnt_mult, buffer); - - pack32(gres_ptr->node_cnt, buffer); - for (i=0; i<gres_ptr->node_cnt; i++) - pack_bit_str(gres_ptr->nic_bit_alloc[i], buffer); - - return SLURM_SUCCESS; -} - -extern int job_state_unpack(void **gres_data, Buf buffer) -{ - int i; - nic_job_state_t *gres_ptr; - - gres_ptr = xmalloc(sizeof(nic_job_state_t)); - - if (buffer) { - safe_unpack32(&gres_ptr->nic_cnt_alloc, buffer); - safe_unpack8 (&gres_ptr->nic_cnt_mult, buffer); - - safe_unpack32(&gres_ptr->node_cnt, buffer); - gres_ptr->nic_bit_alloc = xmalloc(sizeof(bitstr_t *) * - gres_ptr->node_cnt); - for (i=0; i<gres_ptr->node_cnt; i++) - unpack_bit_str(&gres_ptr->nic_bit_alloc[i], buffer); - } - - *gres_data = gres_ptr; - return SLURM_SUCCESS; - -unpack_error: - error("Unpacking %s job state info", plugin_name); - if (gres_ptr->nic_bit_alloc) { - for (i=0; i<gres_ptr->node_cnt; i++) - FREE_NULL_BITMAP(gres_ptr->nic_bit_alloc[i]); - xfree(gres_ptr->nic_bit_alloc); - } - xfree(gres_ptr); - *gres_data = NULL; - return SLURM_ERROR; -} - -extern uint32_t job_test(void *job_gres_data, void *node_gres_data, - bool use_total_gres) -{ - uint32_t gres_avail; - nic_job_state_t *job_gres_ptr = (nic_job_state_t *) job_gres_data; - nic_node_state_t *node_gres_ptr = (nic_node_state_t *) node_gres_data; - - gres_avail = node_gres_ptr->nic_cnt_avail; - if (!use_total_gres) - gres_avail -= node_gres_ptr->nic_cnt_alloc; - - if (job_gres_ptr->nic_cnt_mult == 0) { - /* per node gres limit */ - if (job_gres_ptr->nic_cnt_alloc > gres_avail) - return (uint32_t) 0; - return NO_VAL; - } else { - /* per CPU gres limit */ - return (uint32_t) (gres_avail / job_gres_ptr->nic_cnt_alloc); - } -} - -extern int job_alloc(void *job_gres_data, void *node_gres_data, - int node_cnt, int node_offset, uint32_t cpu_cnt) -{ - int i; - uint32_t gres_cnt; - nic_job_state_t *job_gres_ptr = (nic_job_state_t *) job_gres_data; - nic_node_state_t *node_gres_ptr = (nic_node_state_t *) node_gres_data; - - /* - * Validate data structures. Either job_gres_data->node_cnt and - * job_gres_data->nic_bit_alloc are both set or both zero/NULL. - */ - xassert(node_cnt); - xassert(node_offset >= 0); - xassert(job_gres_ptr); - xassert(node_gres_ptr); - xassert(node_gres_ptr->nic_bit_alloc); - if (job_gres_ptr->node_cnt == 0) { - job_gres_ptr->node_cnt = node_cnt; - if (job_gres_ptr->nic_bit_alloc) { - error("%s: node_cnt==0 and bit_alloc is set", - plugin_name); - xfree(job_gres_ptr->nic_bit_alloc); - } - job_gres_ptr->nic_bit_alloc = xmalloc(sizeof(bitstr_t *) * - node_cnt); - } else if (job_gres_ptr->node_cnt < node_cnt) { - error("%s: node_cnt increase from %u to %d", - plugin_name, job_gres_ptr->node_cnt, node_cnt); - if (node_offset >= job_gres_ptr->node_cnt) - return SLURM_ERROR; - } else if (job_gres_ptr->node_cnt > node_cnt) { - error("%s: node_cnt decrease from %u to %d", - plugin_name, job_gres_ptr->node_cnt, node_cnt); - } - - /* - * Check that sufficient resources exist on this node - */ - if (job_gres_ptr->nic_cnt_mult == 0) - gres_cnt = job_gres_ptr->nic_cnt_alloc; - else - gres_cnt = (job_gres_ptr->nic_cnt_alloc * cpu_cnt); - i = node_gres_ptr->nic_cnt_alloc + gres_cnt; - i -= node_gres_ptr->nic_cnt_avail; - if (i > 0) { - error("%s: overallocated resources by %d", plugin_name, i); - /* proceed with request, give job what's available */ - } - - /* - * Select the specific resources to use for this job. - * We'll need to add topology information in the future - */ - if (job_gres_ptr->nic_bit_alloc[node_offset]) { - /* Resuming a suspended job, resources already allocated */ - debug("%s: job's bit_alloc is already set for node %d", - plugin_name, node_offset); - gres_cnt = MIN(bit_size(node_gres_ptr->nic_bit_alloc), - bit_size(job_gres_ptr-> - nic_bit_alloc[node_offset])); - for (i=0; i<gres_cnt; i++) { - if (bit_test(job_gres_ptr->nic_bit_alloc[node_offset], - i)) { - bit_set(node_gres_ptr->nic_bit_alloc, i); - node_gres_ptr->nic_cnt_alloc++; - } - } - } else { - job_gres_ptr->nic_bit_alloc[node_offset] = - bit_alloc(node_gres_ptr->nic_cnt_avail); - if (job_gres_ptr->nic_bit_alloc[node_offset] == NULL) - fatal("bit_copy: malloc failure"); - for (i=0; i<node_gres_ptr->nic_cnt_avail && gres_cnt>0; i++) { - if (bit_test(node_gres_ptr->nic_bit_alloc, i)) - continue; - bit_set(node_gres_ptr->nic_bit_alloc, i); - bit_set(job_gres_ptr->nic_bit_alloc[node_offset], i); - node_gres_ptr->nic_cnt_alloc++; - gres_cnt--; - } - } - - return SLURM_SUCCESS; -} - -extern int job_dealloc(void *job_gres_data, void *node_gres_data, - int node_offset) -{ - int i, len; - nic_job_state_t *job_gres_ptr = (nic_job_state_t *) job_gres_data; - nic_node_state_t *node_gres_ptr = (nic_node_state_t *) node_gres_data; - - /* - * Validate data structures. Either job_gres_data->node_cnt and - * job_gres_data->nic_bit_alloc are both set or both zero/NULL. - */ - xassert(node_offset >= 0); - xassert(job_gres_ptr); - xassert(node_gres_ptr); - xassert(node_gres_ptr->nic_bit_alloc); - if (job_gres_ptr->node_cnt <= node_offset) { - error("%s: bad node_offset %d count is %u", - plugin_name, node_offset, job_gres_ptr->node_cnt); - return SLURM_ERROR; - } - if (job_gres_ptr->nic_bit_alloc == NULL) { - error("%s: job's bitmap is NULL", plugin_name); - return SLURM_ERROR; - } - if (job_gres_ptr->nic_bit_alloc[node_offset] == NULL) { - error("%s: job's bitmap is empty", plugin_name); - return SLURM_ERROR; - } - - len = bit_size(job_gres_ptr->nic_bit_alloc[node_offset]); - i = bit_size(node_gres_ptr->nic_bit_alloc); - if (i != len) { - error("%s: job and node bitmap sizes differ (%d != %d)", - plugin_name, len, i); - len = MIN(len, i); - /* proceed with request, make best effort */ - } - for (i=0; i<len; i++) { - if (!bit_test(job_gres_ptr->nic_bit_alloc[node_offset], i)) - continue; - bit_clear(node_gres_ptr->nic_bit_alloc, i); - /* NOTE: Do not clear bit from - * job_gres_ptr->nic_bit_alloc[node_offset] - * since this may only be an emulated deallocate */ - node_gres_ptr->nic_cnt_alloc--; - } - - return SLURM_SUCCESS; -} - -extern void job_state_log(void *gres_data, uint32_t job_id) -{ - nic_job_state_t *gres_ptr; - char *mult, tmp_str[128]; - int i; - - xassert(gres_data); - gres_ptr = (nic_job_state_t *) gres_data; - info("%s state for job %u", plugin_name, job_id); - if (gres_ptr->nic_cnt_mult) - mult = "cpu"; - else - mult = "node"; - info(" nic_cnt:%u per %s node_cnt:%u", gres_ptr->nic_cnt_alloc, mult, - gres_ptr->node_cnt); - - if (gres_ptr->node_cnt && gres_ptr->nic_bit_alloc) { - for (i=0; i<gres_ptr->node_cnt; i++) { - bit_fmt(tmp_str, sizeof(tmp_str), - gres_ptr->nic_bit_alloc[i]); - info(" nic_bit_alloc[%d]:%s", i, tmp_str); - } - } else { - info(" nic_bit_alloc:NULL"); - } - - if (gres_ptr->node_cnt && gres_ptr->nic_bit_step_alloc) { - for (i=0; i<gres_ptr->node_cnt; i++) { - bit_fmt(tmp_str, sizeof(tmp_str), - gres_ptr->nic_bit_step_alloc[i]); - info(" nic_bit_step_alloc[%d]:%s", i, tmp_str); - } - } else { - info(" nic_bit_step_alloc:NULL"); - } -} - -extern void step_state_delete(void *gres_data) -{ - int i; - nic_step_state_t *gres_ptr = (nic_step_state_t *) gres_data; - - if (gres_ptr == NULL) - return; - - if (gres_ptr->nic_bit_alloc) { - for (i=0; i<gres_ptr->node_cnt; i++) - FREE_NULL_BITMAP(gres_ptr->nic_bit_alloc[i]); - xfree(gres_ptr->nic_bit_alloc); - } - xfree(gres_ptr); -} - -extern int step_state_validate(char *config, void **gres_data) -{ - char *last = NULL; - nic_job_state_t *gres_ptr; - uint32_t cnt; - uint8_t mult = 0; - - if (!strcmp(config, "nic")) { - cnt = 1; - } else if (!strncmp(config, "nic:", 4)) { - cnt = strtol(config+4, &last, 10); - if (last[0] == '\0') - ; - else if ((last[0] == 'k') || (last[0] == 'K')) - cnt *= 1024; - else if (!strcasecmp(last, "*cpu")) - mult = 1; - else - return SLURM_ERROR; - if (cnt == 0) - return SLURM_ERROR; - } else - return SLURM_ERROR; - - gres_ptr = xmalloc(sizeof(nic_step_state_t)); - gres_ptr->nic_cnt_alloc = cnt; - gres_ptr->nic_cnt_mult = mult; - *gres_data = gres_ptr; - return SLURM_SUCCESS; -} - -extern void *step_state_dup(void *gres_data) -{ - - int i; - nic_step_state_t *gres_ptr = (nic_step_state_t *) gres_data; - nic_step_state_t *new_gres_ptr; - - if (gres_ptr == NULL) - return NULL; - - new_gres_ptr = xmalloc(sizeof(nic_step_state_t)); - new_gres_ptr->nic_cnt_alloc = gres_ptr->nic_cnt_alloc; - new_gres_ptr->nic_cnt_mult = gres_ptr->nic_cnt_mult; - new_gres_ptr->node_cnt = gres_ptr->node_cnt; - new_gres_ptr->nic_bit_alloc = xmalloc(sizeof(bitstr_t *) * - gres_ptr->node_cnt); - for (i=0; i<gres_ptr->node_cnt; i++) { - if (gres_ptr->nic_bit_alloc[i] == NULL) - continue; - new_gres_ptr->nic_bit_alloc[i] = bit_copy(gres_ptr-> - nic_bit_alloc[i]); - } - return new_gres_ptr; -} - -extern int step_state_pack(void *gres_data, Buf buffer) -{ - int i; - nic_step_state_t *gres_ptr = (nic_step_state_t *) gres_data; - - pack32(gres_ptr->nic_cnt_alloc, buffer); - pack8 (gres_ptr->nic_cnt_mult, buffer); - - pack32(gres_ptr->node_cnt, buffer); - for (i=0; i<gres_ptr->node_cnt; i++) - pack_bit_str(gres_ptr->nic_bit_alloc[i], buffer); - - return SLURM_SUCCESS; -} - -extern int step_state_unpack(void **gres_data, Buf buffer) -{ - int i; - nic_step_state_t *gres_ptr; - - gres_ptr = xmalloc(sizeof(nic_step_state_t)); - - if (buffer) { - safe_unpack32(&gres_ptr->nic_cnt_alloc, buffer); - safe_unpack8 (&gres_ptr->nic_cnt_mult, buffer); - - safe_unpack32(&gres_ptr->node_cnt, buffer); - gres_ptr->nic_bit_alloc = xmalloc(sizeof(bitstr_t *) * - gres_ptr->node_cnt); - for (i=0; i<gres_ptr->node_cnt; i++) - unpack_bit_str(&gres_ptr->nic_bit_alloc[i], buffer); - } - - *gres_data = gres_ptr; - return SLURM_SUCCESS; - -unpack_error: - error("Unpacking %s step state info", plugin_name); - if (gres_ptr->nic_bit_alloc) { - for (i=0; i<gres_ptr->node_cnt; i++) - FREE_NULL_BITMAP(gres_ptr->nic_bit_alloc[i]); - xfree(gres_ptr->nic_bit_alloc); - } - xfree(gres_ptr); - *gres_data = NULL; - return SLURM_ERROR; -} - -extern void step_state_log(void *gres_data, uint32_t job_id, uint32_t step_id) -{ - nic_step_state_t *gres_ptr = (nic_step_state_t *) gres_data; - char *mult, tmp_str[128]; - int i; - - xassert(gres_ptr); - info("%s state for step %u.%u", plugin_name, job_id, step_id); - if (gres_ptr->nic_cnt_mult) - mult = "cpu"; - else - mult = "node"; - info(" nic_cnt:%u per %s node_cnt:%u", gres_ptr->nic_cnt_alloc, mult, - gres_ptr->node_cnt); - - if (gres_ptr->node_cnt && gres_ptr->nic_bit_alloc) { - for (i=0; i<gres_ptr->node_cnt; i++) { - bit_fmt(tmp_str, sizeof(tmp_str), - gres_ptr->nic_bit_alloc[i]); - info(" nic_bit_alloc[%d]:%s", i, tmp_str); - } - } else { - info(" nic_bit_alloc:NULL"); - } -} - -extern uint32_t step_test(void *step_gres_data, void *job_gres_data, - int node_offset, bool ignore_alloc) -{ - nic_job_state_t *job_gres_ptr = (nic_job_state_t *) job_gres_data; - nic_step_state_t *step_gres_ptr = (nic_step_state_t *) step_gres_data; - uint32_t gres_cnt; - - xassert(job_gres_ptr); - xassert(step_gres_ptr); - - if (node_offset == NO_VAL) { - if (step_gres_ptr->nic_cnt_alloc > job_gres_ptr->nic_cnt_alloc) - return 0; - return NO_VAL; - } - - if (node_offset >= job_gres_ptr->node_cnt) { - error("%s step_test node offset invalid (%d >= %u)", - plugin_name, node_offset, job_gres_ptr->node_cnt); - return 0; - } - if ((job_gres_ptr->nic_bit_alloc == NULL) || - (job_gres_ptr->nic_bit_alloc[node_offset] == NULL)) { - error("%s step_test nic_bit_alloc is NULL", plugin_name); - return 0; - } - - gres_cnt = bit_set_count(job_gres_ptr->nic_bit_alloc[node_offset]); - if (!ignore_alloc && - job_gres_ptr->nic_bit_step_alloc && - job_gres_ptr->nic_bit_step_alloc[node_offset]) { - gres_cnt -= bit_set_count(job_gres_ptr-> - nic_bit_step_alloc[node_offset]); - } - if (step_gres_ptr->nic_cnt_mult) /* Gres count per CPU */ - gres_cnt /= step_gres_ptr->nic_cnt_alloc; - else if (step_gres_ptr->nic_cnt_alloc > gres_cnt) - gres_cnt = 0; - else - gres_cnt = NO_VAL; - - return gres_cnt; -} - -extern int step_alloc(void *step_gres_data, void *job_gres_data, - int node_offset, int cpu_cnt) -{ - nic_job_state_t *job_gres_ptr = (nic_job_state_t *) job_gres_data; - nic_step_state_t *step_gres_ptr = (nic_step_state_t *) step_gres_data; - uint32_t gres_avail, gres_needed; - bitstr_t *nic_bit_alloc; - - xassert(job_gres_ptr); - xassert(step_gres_ptr); - if (node_offset >= job_gres_ptr->node_cnt) { - error("%s step_alloc node offset invalid (%d >= %u)", - plugin_name, node_offset, job_gres_ptr->node_cnt); - return SLURM_ERROR; - } - if ((job_gres_ptr->nic_bit_alloc == NULL) || - (job_gres_ptr->nic_bit_alloc[node_offset] == NULL)) { - error("%s step_alloc nic_bit_alloc is NULL", plugin_name); - return SLURM_ERROR; - } - - nic_bit_alloc = bit_copy(job_gres_ptr->nic_bit_alloc[node_offset]); - if (nic_bit_alloc == NULL) - fatal("bit_copy malloc failure"); - if (job_gres_ptr->nic_bit_step_alloc && - job_gres_ptr->nic_bit_step_alloc[node_offset]) { - bit_not(job_gres_ptr->nic_bit_step_alloc[node_offset]); - bit_and(nic_bit_alloc, - job_gres_ptr->nic_bit_step_alloc[node_offset]); - bit_not(job_gres_ptr->nic_bit_step_alloc[node_offset]); - } - gres_avail = bit_set_count(nic_bit_alloc); - gres_needed = step_gres_ptr->nic_cnt_alloc; - if (step_gres_ptr->nic_cnt_mult) - gres_needed *= cpu_cnt; - if (gres_needed > gres_avail) { - error("%s step oversubscribing resources on node %d", - plugin_name, node_offset); - } else { - int gres_rem = gres_needed; - int i, len = bit_size(nic_bit_alloc); - for (i=0; i<len; i++) { - if (gres_rem > 0) { - if (bit_test(nic_bit_alloc, i)) - gres_rem--; - } else { - bit_clear(nic_bit_alloc, i); - } - } - } - - if (job_gres_ptr->nic_bit_step_alloc == NULL) { - job_gres_ptr->nic_bit_step_alloc = - xmalloc(sizeof(bitstr_t *) * job_gres_ptr->node_cnt); - } - if (job_gres_ptr->nic_bit_step_alloc[node_offset]) { - bit_or(job_gres_ptr->nic_bit_step_alloc[node_offset], - nic_bit_alloc); - } else { - job_gres_ptr->nic_bit_step_alloc[node_offset] = - bit_copy(nic_bit_alloc); - } - if (step_gres_ptr->nic_bit_alloc == NULL) { - step_gres_ptr->nic_bit_alloc = xmalloc(sizeof(bitstr_t *) * - job_gres_ptr->node_cnt); - step_gres_ptr->node_cnt = job_gres_ptr->node_cnt; - } - if (step_gres_ptr->nic_bit_alloc[node_offset]) { - error("%s step bit_alloc already exists", plugin_name); - bit_or(step_gres_ptr->nic_bit_alloc[node_offset],nic_bit_alloc); - FREE_NULL_BITMAP(nic_bit_alloc); - } else { - step_gres_ptr->nic_bit_alloc[node_offset] = nic_bit_alloc; - } - - return SLURM_SUCCESS; -} - -extern int step_dealloc(void *step_gres_data, void *job_gres_data) -{ - - nic_job_state_t *job_gres_ptr = (nic_job_state_t *) job_gres_data; - nic_step_state_t *step_gres_ptr = (nic_step_state_t *) step_gres_data; - uint32_t i, j, node_cnt; - int len_j, len_s; - - xassert(job_gres_ptr); - xassert(step_gres_ptr); - node_cnt = MIN(job_gres_ptr->node_cnt, step_gres_ptr->node_cnt); - if (step_gres_ptr->nic_bit_alloc == NULL) { - error("%s step dealloc bit_alloc is NULL", plugin_name); - return SLURM_ERROR; - } - if (job_gres_ptr->nic_bit_alloc == NULL) { - error("%s step dealloc, job's bit_alloc is NULL", plugin_name); - return SLURM_ERROR; - } - for (i=0; i<node_cnt; i++) { - if (step_gres_ptr->nic_bit_alloc[i] == NULL) - continue; - if (job_gres_ptr->nic_bit_alloc[i] == NULL) { - error("%s step dealloc, job's bit_alloc[%d] is NULL", - plugin_name, i); - continue; - } - len_j = bit_size(job_gres_ptr->nic_bit_alloc[i]); - len_s = bit_size(step_gres_ptr->nic_bit_alloc[i]); - if (len_j != len_s) { - error("%s step dealloc, bit_alloc[%d] size mis-match" - "(%d != %d)", len_j, len_s); - len_j = MIN(len_j, len_s); - } - for (j=0; j<len_j; j++) { - if (!bit_test(step_gres_ptr->nic_bit_alloc[i], j)) - continue; - if (job_gres_ptr->nic_bit_step_alloc && - job_gres_ptr->nic_bit_step_alloc[i]) { - bit_clear(job_gres_ptr->nic_bit_step_alloc[i], - j); - } - } - FREE_NULL_BITMAP(step_gres_ptr->nic_bit_alloc[i]); - } - - return SLURM_SUCCESS; -} -- GitLab