From 9c2f11d2d247570d6cf4e1052a192896c1e6e18a Mon Sep 17 00:00:00 2001 From: Danny Auble <da@llnl.gov> Date: Thu, 3 Sep 2009 23:29:33 +0000 Subject: [PATCH] added checks for preemption loops and modify qos --- slurm/slurm_errno.h | 1 + src/common/assoc_mgr.c | 28 ++++- src/common/slurm_errno.c | 2 + src/common/slurmdbd_defs.c | 27 ++++- .../mysql/accounting_storage_mysql.c | 100 +++++++++++++++--- src/sacctmgr/qos_functions.c | 6 +- src/sacctmgr/sacctmgr.c | 5 +- src/sacctmgr/sacctmgr.h | 1 + src/slurmdbd/proc_req.c | 3 + src/slurmdbd/slurmdbd.c | 6 +- 10 files changed, 155 insertions(+), 24 deletions(-) diff --git a/slurm/slurm_errno.h b/slurm/slurm_errno.h index 93605fe16ff..b1e6eafc773 100644 --- a/slurm/slurm_errno.h +++ b/slurm/slurm_errno.h @@ -176,6 +176,7 @@ enum { ESLURM_INVALID_BLOCK_LAYOUT, ESLURM_INVALID_BLOCK_NAME, ESLURM_INVALID_QOS, + ESLURM_QOS_PREEMPTION_LOOP, /* switch specific error codes, specific values defined in plugin module */ ESLURM_SWITCH_MIN = 3000, diff --git a/src/common/assoc_mgr.c b/src/common/assoc_mgr.c index 95b367abaac..68f1c5f8f11 100644 --- a/src/common/assoc_mgr.c +++ b/src/common/assoc_mgr.c @@ -1193,7 +1193,10 @@ extern int assoc_mgr_fill_in_qos(void *db_conn, acct_qos_rec_t *qos, qos->norm_priority = found_qos->norm_priority; - if(!qos->preempt_bitstr) + if(qos->preempt_bitstr) { + FREE_NULL_BITMAP(qos->preempt_bitstr); + qos->preempt_bitstr = bit_copy(found_qos->preempt_bitstr); + } else qos->preempt_bitstr = found_qos->preempt_bitstr; qos->priority = found_qos->priority; @@ -2093,7 +2096,14 @@ extern int assoc_mgr_update_qos(acct_update_object_t *update) break; } list_append(assoc_mgr_qos_list, object); - +/* char *tmp = get_qos_complete_str_bitstr( */ +/* assoc_mgr_qos_list, */ +/* object->preempt_bitstr); */ + +/* info("new qos %s(%d) now preempts %s", */ +/* object->name, object->id, tmp); */ +/* xfree(tmp); */ + /* Since in the database id's don't start at 1 instead of 0 we need to ignore the 0 bit and start with 1 so increase the count by 1. @@ -2143,6 +2153,13 @@ extern int assoc_mgr_update_qos(acct_update_object_t *update) rec->preempt_bitstr = object->preempt_bitstr; object->preempt_bitstr = NULL; + /* char *tmp = get_qos_complete_str_bitstr( */ +/* assoc_mgr_qos_list, */ +/* rec->preempt_bitstr); */ + +/* info("qos %s(%d) now preempts %s", */ +/* rec->name, rec->id, tmp); */ +/* xfree(tmp); */ } if(object->priority != NO_VAL) @@ -2153,6 +2170,11 @@ extern int assoc_mgr_update_qos(acct_update_object_t *update) break; case ACCT_REMOVE_QOS: + if(rec) + list_delete_item(itr); + + if(!assoc_mgr_association_list) + break; /* Remove this qos from all the associations on this cluster. */ @@ -2169,8 +2191,6 @@ extern int assoc_mgr_update_qos(acct_update_object_t *update) list_iterator_destroy(assoc_itr); slurm_mutex_unlock(&assoc_mgr_association_lock); - if(rec) - list_delete_item(itr); break; default: break; diff --git a/src/common/slurm_errno.c b/src/common/slurm_errno.c index 55171b69415..8cb77ddc222 100644 --- a/src/common/slurm_errno.c +++ b/src/common/slurm_errno.c @@ -254,6 +254,8 @@ static slurm_errtab_t slurm_errtab[] = { "Functionality not available with current block layout mode"}, { ESLURM_INVALID_BLOCK_NAME, "Invalid block name specified" }, + { ESLURM_QOS_PREEMPTION_LOOP, + "QOS Preemption loop detected" }, /* slurmd error codes */ diff --git a/src/common/slurmdbd_defs.c b/src/common/slurmdbd_defs.c index 5662afa032c..7a4fbe660b0 100644 --- a/src/common/slurmdbd_defs.c +++ b/src/common/slurmdbd_defs.c @@ -252,7 +252,10 @@ extern int slurm_send_recv_slurmdbd_msg(uint16_t rpc_version, } } - buffer = pack_slurmdbd_msg(rpc_version, req); + if(!(buffer = pack_slurmdbd_msg(rpc_version, req))) { + rc = SLURM_ERROR; + goto end_it; + } rc = _send_msg(buffer); free_buf(buffer); @@ -501,6 +504,7 @@ extern Buf pack_slurmdbd_msg(uint16_t rpc_version, slurmdbd_msg_t *req) case DBD_MODIFY_ACCOUNTS: case DBD_MODIFY_ASSOCS: case DBD_MODIFY_CLUSTERS: + case DBD_MODIFY_QOS: case DBD_MODIFY_USERS: slurmdbd_pack_modify_msg( rpc_version, req->msg_type, @@ -672,6 +676,7 @@ extern int unpack_slurmdbd_msg(uint16_t rpc_version, case DBD_MODIFY_ACCOUNTS: case DBD_MODIFY_ASSOCS: case DBD_MODIFY_CLUSTERS: + case DBD_MODIFY_QOS: case DBD_MODIFY_USERS: rc = slurmdbd_unpack_modify_msg( rpc_version, @@ -801,6 +806,8 @@ extern slurmdbd_msg_type_t str_2_slurmdbd_msg_type(char *msg_type) return DBD_MODIFY_ASSOCS; } else if(!strcasecmp(msg_type, "Modify Clusters")) { return DBD_MODIFY_CLUSTERS; + } else if(!strcasecmp(msg_type, "Modify QOS")) { + return DBD_MODIFY_QOS; } else if(!strcasecmp(msg_type, "Modify Users")) { return DBD_MODIFY_USERS; } else if(!strcasecmp(msg_type, "Node State")) { @@ -1077,6 +1084,12 @@ extern char *slurmdbd_msg_type_2_str(slurmdbd_msg_type_t msg_type, int get_enum) } else return "Modify Clusters"; break; + case DBD_MODIFY_QOS: + if(get_enum) { + return "DBD_MODIFY_QOS"; + } else + return "Modify QOS"; + break; case DBD_MODIFY_USERS: if(get_enum) { return "DBD_MODIFY_USERS"; @@ -2213,6 +2226,10 @@ void inline slurmdbd_free_modify_msg(uint16_t rpc_version, destroy_cond = destroy_acct_cluster_cond; destroy_rec = destroy_acct_cluster_rec; break; + case DBD_MODIFY_QOS: + destroy_cond = destroy_acct_qos_cond; + destroy_rec = destroy_acct_qos_rec; + break; case DBD_MODIFY_USERS: destroy_cond = destroy_acct_user_cond; destroy_rec = destroy_acct_user_rec; @@ -3249,6 +3266,10 @@ void inline slurmdbd_pack_modify_msg(uint16_t rpc_version, my_cond = pack_acct_cluster_cond; my_rec = pack_acct_cluster_rec; break; + case DBD_MODIFY_QOS: + my_cond = pack_acct_qos_cond; + my_rec = pack_acct_qos_rec; + break; case DBD_MODIFY_USERS: my_cond = pack_acct_user_cond; my_rec = pack_acct_user_rec; @@ -3285,6 +3306,10 @@ int inline slurmdbd_unpack_modify_msg(uint16_t rpc_version, my_cond = unpack_acct_cluster_cond; my_rec = unpack_acct_cluster_rec; break; + case DBD_MODIFY_QOS: + my_cond = unpack_acct_qos_cond; + my_rec = unpack_acct_qos_rec; + break; case DBD_MODIFY_USERS: my_cond = unpack_acct_user_cond; my_rec = unpack_acct_user_rec; diff --git a/src/plugins/accounting_storage/mysql/accounting_storage_mysql.c b/src/plugins/accounting_storage/mysql/accounting_storage_mysql.c index b2cdecee040..eb55c61446f 100644 --- a/src/plugins/accounting_storage/mysql/accounting_storage_mysql.c +++ b/src/plugins/accounting_storage/mysql/accounting_storage_mysql.c @@ -348,6 +348,43 @@ no_wckeyid: return wckeyid; } +static int _preemption_loop(mysql_conn_t *mysql_conn, int begin_qosid, + bitstr_t *preempt_bitstr) +{ + acct_qos_rec_t qos_rec; + int rc = 0, i=0; + + xassert(preempt_bitstr); + + /* check in the preempt list for all qos's preempted */ + for(i=0; i<bit_size(preempt_bitstr); i++) { + if(!bit_test(preempt_bitstr, i)) + continue; + + memset(&qos_rec, 0, sizeof(qos_rec)); + qos_rec.id = i; + assoc_mgr_fill_in_qos(mysql_conn, &qos_rec, + ACCOUNTING_ENFORCE_QOS, + NULL); + /* check if the begin_qosid is preempted by this qos + * if so we have a loop */ + if(qos_rec.preempt_bitstr + && bit_test(qos_rec.preempt_bitstr, begin_qosid)) { + error("QOS id %d has a loop at QOS %s", + begin_qosid, qos_rec.name); + rc = 1; + break; + } else if(qos_rec.preempt_bitstr) { + /* check this qos' preempt list and make sure + no loops exist there either */ + if((rc = _preemption_loop(mysql_conn, begin_qosid, + qos_rec.preempt_bitstr))) + break; + } + } + return rc; +} + static int _set_usage_information(char **usage_table, slurmdbd_msg_type_t type, time_t *usage_start, time_t *usage_end) { @@ -705,7 +742,7 @@ end_qos: static int _setup_qos_limits(acct_qos_rec_t *qos, char **cols, char **vals, - char **extra) + char **extra, char **added_preempt) { if(!qos) return SLURM_ERROR; @@ -880,11 +917,17 @@ static int _setup_qos_limits(acct_qos_rec_t *qos, "replace(%s, ',%s', ''), ',%s')", begin_preempt, tmp_char+1, tmp_char+1); + if(added_preempt) + xstrfmtcat(*added_preempt, ",%s", + tmp_char+1); xfree(begin_preempt); begin_preempt = preempt_val; - } else if(tmp_char[0]) + } else if(tmp_char[0]) { xstrfmtcat(preempt_val, ",%s", tmp_char); - else + if(added_preempt) + xstrfmtcat(*added_preempt, ",%s", + tmp_char); + } else xstrcat(preempt_val, ""); } list_iterator_destroy(preempt_itr); @@ -4702,6 +4745,7 @@ extern int acct_storage_p_add_qos(mysql_conn_t *mysql_conn, uint32_t uid, char *user_name = NULL; int affect_rows = 0; int added = 0; + char *added_preempt = NULL; if(_check_connection(mysql_conn) != SLURM_SUCCESS) return SLURM_ERROR; @@ -4719,7 +4763,13 @@ extern int acct_storage_p_add_qos(mysql_conn_t *mysql_conn, uint32_t uid, now, now, object->name); xstrfmtcat(extra, ", mod_time=%d", now); - _setup_qos_limits(object, &cols, &vals, &extra); + _setup_qos_limits(object, &cols, &vals, &extra, &added_preempt); + if(added_preempt) { + object->preempt_bitstr = bit_alloc(g_qos_count); + bit_unfmt(object->preempt_bitstr, added_preempt+1); + xfree(added_preempt); + } + xstrfmtcat(query, "insert into %s (%s) values (%s) " "on duplicate key update deleted=0, " @@ -4749,9 +4799,7 @@ extern int acct_storage_p_add_qos(mysql_conn_t *mysql_conn, uint32_t uid, xfree(vals); continue; } - /* FIX ME: we have to edit all the other qos's to set - there preemptee or preemptor based on what is here. - */ + /* we always have a ', ' as the first 2 chars */ tmp_extra = _fix_double_quotes(extra+2); @@ -5899,6 +5947,8 @@ extern List acct_storage_p_modify_qos(mysql_conn_t *mysql_conn, uint32_t uid, MYSQL_RES *result = NULL; MYSQL_ROW row; char *tmp_char1=NULL, *tmp_char2=NULL; + bitstr_t *preempt_bitstr = NULL; + char *added_preempt = NULL; if(!qos_cond || !qos) { error("we need something to change"); @@ -5955,20 +6005,27 @@ extern List acct_storage_p_modify_qos(mysql_conn_t *mysql_conn, uint32_t uid, xstrcat(extra, ")"); } - _setup_qos_limits(qos, &tmp_char1, &tmp_char2, &vals); + _setup_qos_limits(qos, &tmp_char1, &tmp_char2, &vals, &added_preempt); + if(added_preempt) { + preempt_bitstr = bit_alloc(g_qos_count); + bit_unfmt(preempt_bitstr, added_preempt+1); + xfree(added_preempt); + } xfree(tmp_char1); xfree(tmp_char2); if(!extra || !vals) { errno = SLURM_NO_CHANGE_IN_DATA; + FREE_NULL_BITMAP(preempt_bitstr); error("Nothing to change"); return NULL; } - query = xstrdup_printf("select name, preempt from %s %s;", + query = xstrdup_printf("select name, preempt, id from %s %s;", qos_table, extra); xfree(extra); if(!(result = mysql_db_query_ret(mysql_conn->db_conn, query, 0))) { xfree(query); + FREE_NULL_BITMAP(preempt_bitstr); return NULL; } @@ -5976,7 +6033,11 @@ extern List acct_storage_p_modify_qos(mysql_conn_t *mysql_conn, uint32_t uid, ret_list = list_create(slurm_destroy_char); while((row = mysql_fetch_row(result))) { acct_qos_rec_t *qos_rec = NULL; - + if(preempt_bitstr) { + if(_preemption_loop(mysql_conn, + atoi(row[2]), preempt_bitstr)) + break; + } object = xstrdup(row[0]); list_append(ret_list, object); if(!rc) { @@ -5985,6 +6046,7 @@ extern List acct_storage_p_modify_qos(mysql_conn_t *mysql_conn, uint32_t uid, } else { xstrfmtcat(name_char, " || name='%s'", object); } + qos_rec = xmalloc(sizeof(acct_qos_rec_t)); qos_rec->name = xstrdup(object); @@ -6041,6 +6103,18 @@ extern List acct_storage_p_modify_qos(mysql_conn_t *mysql_conn, uint32_t uid, } mysql_free_result(result); + FREE_NULL_BITMAP(preempt_bitstr); + + if(row) { + xfree(vals); + xfree(name_char); + xfree(query); + list_destroy(ret_list); + ret_list = NULL; + errno = ESLURM_QOS_PREEMPTION_LOOP; + return ret_list; + } + if(!list_count(ret_list)) { errno = SLURM_NO_CHANGE_IN_DATA; debug3("didn't effect anything\n%s", query); @@ -8860,9 +8934,11 @@ empty: else qos->max_wall_pu = INFINITE; - if(row[QOS_REQ_PREE] && row[QOS_REQ_PREE][0]) + if(row[QOS_REQ_PREE] && row[QOS_REQ_PREE][0]) { + if(!qos->preempt_bitstr) + qos->preempt_bitstr = bit_alloc(g_qos_count); bit_unfmt(qos->preempt_bitstr, row[QOS_REQ_PREE]+1); - + } if(row[QOS_REQ_PRIO]) qos->priority = atoi(row[QOS_REQ_PRIO]); diff --git a/src/sacctmgr/qos_functions.c b/src/sacctmgr/qos_functions.c index 56053935038..8c5aeb4ee71 100644 --- a/src/sacctmgr/qos_functions.c +++ b/src/sacctmgr/qos_functions.c @@ -68,6 +68,7 @@ static int _set_cond(int *start, int argc, char *argv[], if (!strncasecmp (argv[i], "Set", MAX(command_len, 3))) { i--; + break; } else if (!strncasecmp (argv[i], "WithDeleted", MAX(command_len, 5))) { qos_cond->with_deleted = 1; @@ -131,7 +132,6 @@ static int _set_cond(int *start, int argc, char *argv[], } (*start) = i; - return set; } @@ -282,14 +282,14 @@ static int _set_rec(int *start, int argc, char *argv[], argv[i]); } } else if (!strncasecmp (argv[i], "Preempt", - MAX(command_len, 9))) { + MAX(command_len, 3))) { if(!qos) continue; if(!qos->preempt_list) qos->preempt_list = list_create(slurm_destroy_char); - + if(!g_qos_list) g_qos_list = acct_storage_g_get_qos( db_conn, my_uid, NULL); diff --git a/src/sacctmgr/sacctmgr.c b/src/sacctmgr/sacctmgr.c index c28bb65c59b..58a0f1b5f2d 100644 --- a/src/sacctmgr/sacctmgr.c +++ b/src/sacctmgr/sacctmgr.c @@ -670,6 +670,8 @@ static void _modify_it (int argc, char *argv[]) } else if (strncasecmp (argv[0], "Clusters", MAX(command_len, 1)) == 0) { error_code = sacctmgr_modify_cluster((argc - 1), &argv[1]); + } else if (strncasecmp (argv[0], "QOSs", MAX(command_len, 1)) == 0) { + error_code = sacctmgr_modify_qos((argc - 1), &argv[1]); } else if (strncasecmp (argv[0], "Users", MAX(command_len, 1)) == 0) { error_code = sacctmgr_modify_user((argc - 1), &argv[1]); } else { @@ -677,7 +679,8 @@ static void _modify_it (int argc, char *argv[]) exit_code = 1; fprintf(stderr, "No valid entity in modify command\n"); fprintf(stderr, "Input line must include "); - fprintf(stderr, "\"Account\", \"Cluster\", or \"User\"\n"); + fprintf(stderr, "\"Account\", \"Cluster\", \"QOS\", " + "or \"User\"\n"); } if (error_code == SLURM_ERROR) { diff --git a/src/sacctmgr/sacctmgr.h b/src/sacctmgr/sacctmgr.h index f54b3ddfbf2..55a72c7ff51 100644 --- a/src/sacctmgr/sacctmgr.h +++ b/src/sacctmgr/sacctmgr.h @@ -120,6 +120,7 @@ extern int sacctmgr_modify_association(int argc, char *argv[]); extern int sacctmgr_modify_user(int argc, char *argv[]); extern int sacctmgr_modify_account(int argc, char *argv[]); extern int sacctmgr_modify_cluster(int argc, char *argv[]); +extern int sacctmgr_modify_qos(int argc, char *argv[]); extern int sacctmgr_delete_association(int argc, char *argv[]); extern int sacctmgr_delete_user(int argc, char *argv[]); diff --git a/src/slurmdbd/proc_req.c b/src/slurmdbd/proc_req.c index 115dbd88f5f..a34eacaffc0 100644 --- a/src/slurmdbd/proc_req.c +++ b/src/slurmdbd/proc_req.c @@ -2092,6 +2092,9 @@ static int _modify_qos(slurmdbd_conn_t *slurmdbd_conn, } else if(errno == SLURM_NO_CHANGE_IN_DATA) { comment = "Request didn't affect anything"; rc = SLURM_SUCCESS; + } else if(errno == ESLURM_QOS_PREEMPTION_LOOP) { + comment = "QOS Preemption loop detected"; + rc = ESLURM_QOS_PREEMPTION_LOOP; } else { comment = "Unknown issue"; rc = SLURM_ERROR; diff --git a/src/slurmdbd/slurmdbd.c b/src/slurmdbd/slurmdbd.c index 1beabaabf55..f5d8f53eaaa 100644 --- a/src/slurmdbd/slurmdbd.c +++ b/src/slurmdbd/slurmdbd.c @@ -140,9 +140,9 @@ int main(int argc, char *argv[]) memset(&assoc_init_arg, 0, sizeof(assoc_init_args_t)); - /* If we are tacking wckey we need to cache associations and - wckeys if we aren't only cache the users */ - assoc_init_arg.cache_level = ASSOC_MGR_CACHE_USER; + /* If we are tacking wckey we need to cache + wckeys, if we aren't only cache the users and qos */ + assoc_init_arg.cache_level = ASSOC_MGR_CACHE_USER | ASSOC_MGR_CACHE_QOS; if(slurmdbd_conf->track_wckey) assoc_init_arg.cache_level |= ASSOC_MGR_CACHE_WCKEY; -- GitLab