diff --git a/src/common/node_select.c b/src/common/node_select.c index 7d017dd8d497d719ca61443d458b2f280db41319..32c76a4d2cb38bb536973237dc7297de28d3e49f 100644 --- a/src/common/node_select.c +++ b/src/common/node_select.c @@ -58,13 +58,14 @@ typedef struct slurm_select_ops { int (*state_save) ( char *dir_name ); int (*state_restore) ( char *dir_name ); + int (*job_init) ( List job_list ); int (*node_init) ( struct node_record *node_ptr, int node_cnt); int (*part_init) ( List part_list ); int (*job_test) ( struct job_record *job_ptr, bitstr_t *bitmap, int min_nodes, int max_nodes ); - int (*job_init) ( struct job_record *job_ptr ); + int (*job_begin) ( struct job_record *job_ptr ); int (*job_fini) ( struct job_record *job_ptr ); } slurm_select_ops_t; @@ -111,10 +112,11 @@ static slurm_select_ops_t * _select_get_ops(slurm_select_context_t *c) static const char *syms[] = { "select_p_state_save", "select_p_state_restore", + "select_p_job_init", "select_p_node_init", "select_p_part_init", "select_p_job_test", - "select_p_job_init", + "select_p_job_begin", "select_p_job_fini" }; int n_syms = sizeof( syms ) / sizeof( char * ); @@ -269,6 +271,18 @@ extern int select_g_state_restore(char *dir_name) return (*(g_select_context->ops.state_restore))(dir_name); } +/* + * Note the initialization of job records, issued upon restart of + * slurmctld and used to synchronize any job state. + */ +extern int select_g_job_init(List job_list) +{ + if (slurm_select_init() < 0) + return SLURM_ERROR; + + return (*(g_select_context->ops.job_init))(job_list); +} + /* * Note re/initialization of node record data structure * IN node_ptr - current node data @@ -318,12 +332,12 @@ extern int select_g_job_test(struct job_record *job_ptr, bitstr_t *bitmap, * after select_g_job_test(). Executed from slurmctld. 
* IN job_ptr - pointer to job being initiated */ -extern int select_g_job_init(struct job_record *job_ptr) +extern int select_g_job_begin(struct job_record *job_ptr) { if (slurm_select_init() < 0) return SLURM_ERROR; - return (*(g_select_context->ops.job_init))(job_ptr); + return (*(g_select_context->ops.job_begin))(job_ptr); } /* diff --git a/src/common/node_select.h b/src/common/node_select.h index 66aeb5a010c53015abab0ede14a53f63d7f1e096..e91ba41e55e5a05b424e8e7c7b98994c3d094bbc 100644 --- a/src/common/node_select.h +++ b/src/common/node_select.h @@ -70,6 +70,12 @@ extern int select_g_node_init(struct node_record *node_ptr, int node_cnt); */ extern int select_g_part_init(List part_list); +/* + * Note the initialization of job records, issued upon restart of + * slurmctld and used to synchronize any job state. + */ +extern int select_g_job_init(List job_list); + /******************************************************\ * JOB-SPECIFIC SELECT CREDENTIAL MANAGEMENT FUNCIONS * \******************************************************/ @@ -90,7 +96,7 @@ extern int select_g_job_test(struct job_record *job_ptr, bitstr_t *bitmap, * after select_g_job_test(). Executed from slurmctld. * IN job_ptr - pointer to job being initiated */ -extern int select_g_job_init(struct job_record *job_ptr); +extern int select_g_job_begin(struct job_record *job_ptr); /* * Note termination of job is starting. Executed from slurmctld. 
diff --git a/src/plugins/select/bluegene/bgl_job_run.c b/src/plugins/select/bluegene/bgl_job_run.c index bda91f572551cbfd4e2237541685853f6f766c6f..1c45c7f33a4a1451bd9fbd950770fde036031af1 100644 --- a/src/plugins/select/bluegene/bgl_job_run.c +++ b/src/plugins/select/bluegene/bgl_job_run.c @@ -45,6 +45,7 @@ #include "src/common/macros.h" #include "src/common/node_select.h" #include "src/common/uid.h" +#include "src/common/xstring.h" #include "src/slurmctld/proc_req.h" #include "bgl_job_run.h" #include "bluegene.h" @@ -55,9 +56,10 @@ #define MAX_PTHREAD_RETRIES 1 #define POLL_INTERVAL 2 +enum update_op {START_OP, TERM_OP, SYNC_OP}; typedef struct bgl_update { - bool start; /* true=start, false=terminate */ + enum update_op op; /* start | terminate | sync */ uid_t uid; /* new owner */ uint32_t job_id; /* SLURM job id */ pm_partition_id_t bgl_part_id; @@ -67,6 +69,22 @@ List bgl_update_list = NULL; static pthread_mutex_t agent_cnt_mutex = PTHREAD_MUTEX_INITIALIZER; static int agent_cnt = 0; +static void _bgl_list_del(void *x); +static void _block_list_elem_del(void *x); +static int _boot_part(pm_partition_id_t bgl_part_id); +static void _excise_block(List block_list, pm_partition_id_t bgl_part_id); +static List _get_all_blocks(void); +static char * _get_part_owner(pm_partition_id_t bgl_part_id); +static int _match_block_name(void *x, void *key); +static void * _part_agent(void *args); +static void _part_op(bgl_update_t *bgl_update_ptr); +static int _remove_job(db_job_id_t job_id); +static int _set_part_owner(pm_partition_id_t bgl_part_id, char *user); +static void _start_agent(bgl_update_t *bgl_update_ptr); +static void _sync_agent(bgl_update_t *bgl_update_ptr); +static void _term_agent(bgl_update_t *bgl_update_ptr); + + /* Delete a bgl_update_t record */ static void _bgl_list_del(void *x) { @@ -139,6 +157,29 @@ static int _remove_job(db_job_id_t job_id) return INTERNAL_ERROR; } +/* Get the owner of an existing partition. Caller must xfree() return value. 
*/ +static char *_get_part_owner(pm_partition_id_t bgl_part_id) +{ + int rc; + char *owner, *cur_owner; + rm_partition_t * part_elem; + + if ((rc = rm_get_partition(bgl_part_id, &part_elem)) != STATUS_OK) { + error("rm_get_partition(%s): %s", bgl_part_id, bgl_err_str(rc)); + return NULL; + } + if ((rc = rm_get_data(part_elem, RM_PartitionUserName, &owner)) != + STATUS_OK) { + error("rm_get_data(RM_PartitionUserName): %s", bgl_err_str(rc)); + (void) rm_free_partition(part_elem); + return NULL; + } + cur_owner = xstrdup(owner); + if ((rc = rm_free_partition(part_elem)) != STATUS_OK) + error("rm_free_partition(): %s", bgl_err_str(rc)); + return cur_owner; +} + /* Set the owner of an existing partition */ static int _set_part_owner(pm_partition_id_t bgl_part_id, char *user) { @@ -217,8 +258,23 @@ static int _boot_part(pm_partition_id_t bgl_part_id) return SLURM_SUCCESS; } +/* Update partition owner and reboot as needed */ +static void _sync_agent(bgl_update_t *bgl_update_ptr) +{ + char *cur_part_owner, *new_part_owner; + + cur_part_owner = _get_part_owner(bgl_update_ptr->bgl_part_id); + new_part_owner = uid_to_string(bgl_update_ptr->uid); + if (strcmp(cur_part_owner, new_part_owner)) { + /* need to change owner */ + _term_agent(bgl_update_ptr); + _start_agent(bgl_update_ptr); + } + xfree(cur_part_owner); +} + /* Perform job initiation work */ -extern void _start_agent(bgl_update_t *bgl_update_ptr) +static void _start_agent(bgl_update_t *bgl_update_ptr) { int rc; @@ -311,10 +367,12 @@ static void *_part_agent(void *args) return NULL; } slurm_mutex_unlock(&agent_cnt_mutex); - if (bgl_update_ptr->start) + if (bgl_update_ptr->op == START_OP) _start_agent(bgl_update_ptr); - else + else if (bgl_update_ptr->op == TERM_OP) _term_agent(bgl_update_ptr); + else if (bgl_update_ptr->op == SYNC_OP) + _sync_agent(bgl_update_ptr); _bgl_list_del(bgl_update_ptr); } } @@ -350,6 +408,53 @@ static void _part_op(bgl_update_t *bgl_update_ptr) usleep(1000); /* sleep and retry */ } } + +static 
void _block_list_elem_del(void *x) +{ + xfree(x); +} + +static int _match_block_name(void *x, void *key) +{ + pm_partition_id_t elem = (pm_partition_id_t) x; + pm_partition_id_t part_id = (pm_partition_id_t) key; + + if (strcmp(elem, part_id) == 0) + return 1; /* part_id matches */ + return 0; +} + +/* get a list of all BGL blocks with owners */ +static List _get_all_blocks(void) +{ + List ret_list = list_create(_block_list_elem_del); + ListIterator itr; + bgl_record_t *block_ptr; + char *part_id; + + if (!ret_list) + fatal("malloc error"); + + itr = list_iterator_create(bgl_init_part_list); + while ((block_ptr = (bgl_record_t *) list_next(itr))) { + if ((block_ptr->owner_name == NULL) + || (block_ptr->owner_name[0] == '\0') + || (block_ptr->bgl_part_id == NULL) + || (block_ptr->bgl_part_id[0] == '\0')) + continue; + part_id = xstrdup(block_ptr->bgl_part_id); + list_append(ret_list, part_id); + } + list_iterator_destroy(itr); + + return ret_list; +} + +/* remove a BGL block from the given list */ +static void _excise_block(List block_list, pm_partition_id_t bgl_part_id) +{ + list_delete_all(block_list, _match_block_name, bgl_part_id); +} #endif /* @@ -380,7 +485,7 @@ extern int start_job(struct job_record *job_ptr) } bgl_update_ptr = xmalloc(sizeof(bgl_update_t)); - bgl_update_ptr->start = true; + bgl_update_ptr->op = START_OP; bgl_update_ptr->uid = job_ptr->user_id; bgl_update_ptr->job_id = job_ptr->job_id; bgl_update_ptr->bgl_part_id = bgl_part_id; @@ -412,7 +517,7 @@ extern int term_job(struct job_record *job_ptr) job_ptr->job_id, bgl_part_id); bgl_update_ptr = xmalloc(sizeof(bgl_update_t)); - bgl_update_ptr->start = false; + bgl_update_ptr->op = TERM_OP; bgl_update_ptr->uid = job_ptr->user_id; bgl_update_ptr->job_id = job_ptr->job_id; bgl_update_ptr->bgl_part_id = bgl_part_id; @@ -421,3 +526,62 @@ return rc; } +/* + * Synchronize BGL block state to that of currently active jobs. 
+ * This can recover from slurmctld crashes when partition ownership + * changes were queued + */ +extern int sync_jobs(List job_list) +{ +#ifdef HAVE_BGL_FILES + ListIterator job_iterator, block_iterator; + struct job_record *job_ptr; + pm_partition_id_t bgl_part_id; + bgl_update_t *bgl_update_ptr; + List block_list = _get_all_blocks(); + + /* Insure that all running jobs own the specified partition */ + job_iterator = list_iterator_create(job_list); + while ((job_ptr = (struct job_record *) list_next(job_iterator))) { + if (job_ptr->job_state != JOB_RUNNING) + continue; + select_g_get_jobinfo(job_ptr->select_jobinfo, + SELECT_DATA_PART_ID, &bgl_part_id); +#ifdef USE_BGL_BLOCK + debug3("Queue sync of job %u in BGL partition %s", + job_ptr->job_id, bgl_part_id); + bgl_update_ptr = xmalloc(sizeof(bgl_update_t)); + bgl_update_ptr->op = SYNC_OP; + bgl_update_ptr->uid = job_ptr->user_id; + bgl_update_ptr->job_id = job_ptr->job_id; + bgl_update_ptr->bgl_part_id = bgl_part_id; + _part_op(bgl_update_ptr); +#else + info("Queue sync of job %u in BGL partition %s", + job_ptr->job_id, bgl_part_id); +#endif + _excise_block(block_list, bgl_part_id); + } + list_iterator_destroy(job_iterator); + + /* Insure that all other partitions are free */ + block_iterator = list_iterator_create(block_list); + while ((bgl_part_id = (pm_partition_id_t) list_next(block_iterator))) { +#ifdef USE_BGL_BLOCK + debug3("Queue clearing of vestigial owner in BGL partition %s", + bgl_part_id); + bgl_update_ptr = xmalloc(sizeof(bgl_update_t)); + bgl_update_ptr->op = TERM_OP; + bgl_update_ptr->bgl_part_id = bgl_part_id; + _part_op(bgl_update_ptr); +#else + info("Queue clearing of vestigial owner in BGL partition %s", + bgl_part_id); +#endif + } + list_iterator_destroy(block_iterator); + list_destroy(block_list); +#endif + return SLURM_SUCCESS; +} + diff --git a/src/plugins/select/bluegene/bgl_job_run.h b/src/plugins/select/bluegene/bgl_job_run.h index 
508cb8b36d45068efd6b2a76f9a99644ea8e5923..0e249bed5ccb20792b0abd8e0665b4e94514e84d 100644 --- a/src/plugins/select/bluegene/bgl_job_run.h +++ b/src/plugins/select/bluegene/bgl_job_run.h @@ -51,4 +51,10 @@ extern int start_job(struct job_record *job_ptr); */ extern int term_job(struct job_record *job_ptr); +/* + * Synchronize BGL block state to that of currently active jobs. + * This can recover from slurmctld crashes when partition ownership + * changes were queued + */ +extern int sync_jobs(List job_list); #endif /* _BGL_JOB_RUN_H_ */ diff --git a/src/plugins/select/bluegene/bluegene.c b/src/plugins/select/bluegene/bluegene.c index f36c8c8ce1a4a484fd4677ea37aa6cc0644c3778..dee3ad4a89bbdbe7862afe155ef59de17abf6f8b 100644 --- a/src/plugins/select/bluegene/bluegene.c +++ b/src/plugins/select/bluegene/bluegene.c @@ -106,7 +106,7 @@ extern int create_static_partitions(List part_list) if ((rc = _copy_slurm_partition_list(part_list))) return rc; - /* syncronize slurm.conf and bluegene.conf data */ + /* synchronize slurm.conf and bluegene.conf data */ _process_config(); /* @@ -548,6 +548,7 @@ static void _destroy_bgl_record(void* object) if (this_record) { xfree(this_record->nodes); + xfree(this_record->owner_name); xfree(this_record->slurm_part_id); if (this_record->hostlist) hostlist_destroy(this_record->hostlist); diff --git a/src/plugins/select/bluegene/bluegene.h b/src/plugins/select/bluegene/bluegene.h index 4686d4c24c3f0b70562382739219bfff7266bc30..8443bec1def453230c291b7dce48056e9339f34d 100644 --- a/src/plugins/select/bluegene/bluegene.h +++ b/src/plugins/select/bluegene/bluegene.h @@ -74,6 +74,7 @@ enum part_lifecycle {DYNAMIC, STATIC}; typedef struct bgl_record { char* slurm_part_id; /* ID specified by admins */ + char * owner_name; /* Owner of partition */ pm_partition_id_t bgl_part_id; /* ID returned from CMCS */ char* nodes; /* String of nodes in partition */ lifecycle_type_t part_lifecycle;/* either STATIC or DYNAMIC */ @@ -81,7 +82,7 @@ typedef struct 
bgl_record { bitstr_t *bitmap; /* bitmap of nodes for this partition */ struct partition* alloc_part; /* the allocated partition */ int size; /* node count for the partitions */ - rm_connection_type_t conn_type;/* Mesh or Torus or NAV */ + rm_connection_type_t conn_type; /* Mesh or Torus or NAV */ rm_partition_mode_t node_use; /* either COPROCESSOR or VIRTUAL */ } bgl_record_t; diff --git a/src/plugins/select/bluegene/partition_sys.c b/src/plugins/select/bluegene/partition_sys.c index 81240979930bcfa6b5e4dd0272df2cf12098f085..09dad4a563b28e02a37a0a9f5ceff98706848014 100755 --- a/src/plugins/select/bluegene/partition_sys.c +++ b/src/plugins/select/bluegene/partition_sys.c @@ -1040,7 +1040,7 @@ extern int read_bgl_partitions(void) rm_location_t bp_loc; pm_partition_id_t part_id; rm_partition_t *part_ptr; - char node_name_tmp[16]; + char node_name_tmp[16], *owner_name; bgl_record_t *bgl_part_ptr; if (!bgl_init_part_list) @@ -1127,9 +1127,17 @@ extern int read_bgl_partitions(void) error("rm_get_data(RM_PartitionMode): %s", bgl_err_str(rm_rc)); } - info("BglBlock:%s Conn:%s Use:%s", part_id, + if ((rm_rc = rm_get_data(part_ptr, + RM_PartitionUserName, + &owner_name)) != STATUS_OK) { + error("rm_get_data(RM_PartitionUserName): %s", + bgl_err_str(rm_rc)); + } else + bgl_part_ptr->owner_name = xstrdup(owner_name); + info("BglBlock:%s Conn:%s Use:%s Owner:%s", part_id, convert_conn_type(bgl_part_ptr->conn_type), - convert_node_use(bgl_part_ptr->node_use)); + convert_node_use(bgl_part_ptr->node_use), + owner_name); bgl_part_ptr->part_lifecycle = STATIC; if ((rm_rc = rm_free_partition(part_ptr)) != STATUS_OK) { @@ -1154,7 +1162,7 @@ extern int read_bgl_partitions(void) static int _post_bgl_init_read(void *object, void *arg) { bgl_record_t *bgl_part_ptr = (bgl_record_t *) object; - int i = 32; + int i = 1024; bgl_part_ptr->nodes = xmalloc(i); while (hostlist_ranged_string(bgl_part_ptr->hostlist, i, diff --git a/src/plugins/select/bluegene/select_bluegene.c 
b/src/plugins/select/bluegene/select_bluegene.c index 4da7be1954b0f79a596b9c4bdcd17d8fcef1390d..3f8c52cd7b5e7b604c874466390c4c4469b5309f 100644 --- a/src/plugins/select/bluegene/select_bluegene.c +++ b/src/plugins/select/bluegene/select_bluegene.c @@ -240,6 +240,12 @@ extern int select_p_state_restore(char *dir_name) return SLURM_SUCCESS; } +/* Sync BGL blocks to currently active jobs */ +extern int select_p_job_init(List job_list) +{ + return sync_jobs(job_list); +} + /* All initialization is performed by select_p_part_init() */ extern int select_p_node_init(struct node_record *node_ptr, int node_cnt) { @@ -277,7 +283,7 @@ extern int select_p_job_test(struct job_record *job_ptr, bitstr_t *bitmap, return SLURM_SUCCESS; } -extern int select_p_job_init(struct job_record *job_ptr) +extern int select_p_job_begin(struct job_record *job_ptr) { return start_job(job_ptr); } diff --git a/src/plugins/select/linear/select_linear.c b/src/plugins/select/linear/select_linear.c index e508c9998b12068ca02e60e37bf72fab36654ac3..60597cea1947186b4c912f4c8db838afa5f775e0 100644 --- a/src/plugins/select/linear/select_linear.c +++ b/src/plugins/select/linear/select_linear.c @@ -128,6 +128,11 @@ extern int select_p_state_restore(char *dir_name) return SLURM_SUCCESS; } +extern int select_p_job_init(List job_list) +{ + return SLURM_SUCCESS; +} + extern int select_p_node_init(struct node_record *node_ptr, int node_cnt) { if (node_ptr == NULL) { @@ -386,7 +391,7 @@ extern int select_p_job_test(struct job_record *job_ptr, bitstr_t *bitmap, return error_code; } -extern int select_p_job_init(struct job_record *job_ptr) +extern int select_p_job_begin(struct job_record *job_ptr) { return SLURM_SUCCESS; } diff --git a/src/slurmctld/node_scheduler.c b/src/slurmctld/node_scheduler.c index 49924ace4651182207610e8a9cd861d9f2bbb4a0..a287e4ffc444b599343a94c8e0ebb7104c954e4e 100644 --- a/src/slurmctld/node_scheduler.c +++ b/src/slurmctld/node_scheduler.c @@ -724,9 +724,9 @@ extern int select_nodes(struct 
job_record *job_ptr, bool test_only) error_code = SLURM_SUCCESS; goto cleanup; } - if (select_g_job_init(job_ptr) != SLURM_SUCCESS) { + if (select_g_job_begin(job_ptr) != SLURM_SUCCESS) { /* Leave job queued, something is hosed */ - error("select_g_job_init(%u): %m", job_ptr->job_id); + error("select_g_job_begin(%u): %m", job_ptr->job_id); error_code = ESLURM_NODES_BUSY; goto cleanup; } diff --git a/src/slurmctld/read_config.c b/src/slurmctld/read_config.c index e1bb69765fa5ef7f7449a790ac5c078ae55e3b9b..ce07cf73f43ebdc5e2043b4fdfa57c563a118648 100644 --- a/src/slurmctld/read_config.c +++ b/src/slurmctld/read_config.c @@ -894,12 +894,10 @@ int read_slurm_conf(int recover) #ifdef HAVE_ELAN _validate_node_proc_count(); #endif - if (select_g_node_init(node_record_table_ptr, node_record_count) - != SLURM_SUCCESS ) { - error("failed to initialize node selection plugin state"); - abort(); - } - if (select_g_part_init(part_list) != SLURM_SUCCESS ) { + if ((select_g_node_init(node_record_table_ptr, node_record_count) + != SLURM_SUCCESS) + || (select_g_part_init(part_list) != SLURM_SUCCESS) + || (select_g_job_init(job_list) != SLURM_SUCCESS)) { error("failed to initialize node selection plugin state"); abort(); }