From 494086b5847e0b5a61db553f1572d079fa5c3df6 Mon Sep 17 00:00:00 2001 From: Danny Auble <da@llnl.gov> Date: Fri, 23 May 2008 21:12:21 +0000 Subject: [PATCH] added suspend time to the table along with flushing jobs at cold start. Suspend time isn't calculated yet, but we will figure it out --- src/common/assoc_mgr.c | 28 +- src/common/parse_time.c | 3 +- src/common/slurm_accounting_storage.c | 71 ++- src/common/slurm_accounting_storage.h | 11 + src/common/slurmdbd_defs.c | 2 + src/common/slurmdbd_defs.h | 2 + src/database/mysql_common.c | 7 + src/database/mysql_common.h | 1 + .../filetxt/accounting_storage_filetxt.c | 7 + .../mysql/accounting_storage_mysql.c | 477 ++++++++++++++---- .../accounting_storage/mysql/mysql_rollup.c | 322 ++++++++++++ .../accounting_storage/mysql/mysql_rollup.h | 8 + .../none/accounting_storage_none.c | 6 + .../pgsql/accounting_storage_pgsql.c | 11 + .../slurmdbd/accounting_storage_slurmdbd.c | 24 +- src/sacctmgr/account_functions.c | 34 +- src/sacctmgr/cluster_functions.c | 169 ++++--- src/sacctmgr/sacctmgr.c | 2 +- src/sacctmgr/user_functions.c | 34 +- src/slurmctld/controller.c | 7 + src/slurmctld/job_mgr.c | 2 +- src/slurmctld/node_mgr.c | 5 +- src/slurmdbd/proc_req.c | 42 +- 23 files changed, 1041 insertions(+), 234 deletions(-) diff --git a/src/common/assoc_mgr.c b/src/common/assoc_mgr.c index a4af3636e24..1a35ca43221 100644 --- a/src/common/assoc_mgr.c +++ b/src/common/assoc_mgr.c @@ -113,11 +113,13 @@ static int _get_local_association_list(void *db_conn, int enforce) } list_iterator_reset(itr2); } - if(!assoc->user) + if(!assoc->user) { continue; + } passwd_ptr = getpwnam(assoc->user); if(passwd_ptr) assoc->uid = passwd_ptr->pw_uid; + //log_assoc_rec(assoc); } list_iterator_destroy(itr2); list_iterator_destroy(itr); @@ -232,8 +234,10 @@ extern int assoc_mgr_fill_in_assoc(void *db_conn, acct_association_rec_t *assoc, if(!assoc->cluster) assoc->cluster = local_cluster_name; } -/* info("looking for assoc of user=%u, acct=%s, 
cluster=%s, partition=%s", */ -/* assoc->uid, assoc->acct, assoc->cluster, assoc->partition); */ +/* info("looking for assoc of user=%s(%u), acct=%s, " */ +/* "cluster=%s, partition=%s", */ +/* assoc->user, assoc->uid, assoc->acct, */ +/* assoc->cluster, assoc->partition); */ slurm_mutex_lock(&local_association_lock); itr = list_iterator_create(local_association_list); while((found_assoc = list_next(itr))) { @@ -244,12 +248,13 @@ extern int assoc_mgr_fill_in_assoc(void *db_conn, acct_association_rec_t *assoc, } continue; } else { - if(!assoc->user && found_assoc->user) { + if(!assoc->uid && found_assoc->uid) { debug3("we are looking for a " "nonuser association"); continue; } else if(assoc->uid != found_assoc->uid) { - debug3("not the right user"); + debug3("not the right user %u != %u", + assoc->uid, found_assoc->uid); continue; } @@ -300,11 +305,17 @@ extern int assoc_mgr_fill_in_assoc(void *db_conn, acct_association_rec_t *assoc, assoc->cluster = ret_assoc->cluster; if(!assoc->partition) assoc->partition = ret_assoc->partition; + assoc->fairshare = ret_assoc->fairshare; assoc->max_cpu_secs_per_job = ret_assoc->max_cpu_secs_per_job; + assoc->max_jobs = ret_assoc->max_jobs; assoc->max_nodes_per_job = ret_assoc->max_nodes_per_job; assoc->max_wall_duration_per_job = ret_assoc->max_wall_duration_per_job; - /* The other fields are not relevant to the specific job, - * for example max_jobs */ + assoc->parent_acct_ptr = ret_assoc->parent_acct_ptr; + if(assoc->parent_acct) { + xfree(assoc->parent_acct); + assoc->parent_acct = xstrdup(ret_assoc->parent_acct); + } else + assoc->parent_acct = ret_assoc->parent_acct; slurm_mutex_unlock(&local_association_lock); return SLURM_SUCCESS; @@ -481,7 +492,7 @@ extern int assoc_mgr_update_local_assocs(acct_update_object_t *update) rc = SLURM_ERROR; break; } - debug("updating the assocs here on %u", rec->id); + debug("updating assoc %u", rec->id); if(object->fairshare != (uint32_t)NO_VAL) { rec->fairshare = object->fairshare; } @@ 
-517,6 +528,7 @@ extern int assoc_mgr_update_local_assocs(acct_update_object_t *update) parents_changed = 1; } + log_assoc_rec(rec); /* FIX ME: do more updates here */ break; case ACCT_ADD_ASSOC: diff --git a/src/common/parse_time.c b/src/common/parse_time.c index 1a7f06ee32e..e72f2c93e45 100644 --- a/src/common/parse_time.c +++ b/src/common/parse_time.c @@ -471,7 +471,8 @@ extern int time_str2mins(char *string) if ((string == NULL) || (string[0] == '\0')) return -1; /* invalid input */ - if ((!strcasecmp(string, "INFINITE")) || + if ((!strcasecmp(string, "-1")) || + (!strcasecmp(string, "INFINITE")) || (!strcasecmp(string, "UNLIMITED"))) { return INFINITE; } diff --git a/src/common/slurm_accounting_storage.c b/src/common/slurm_accounting_storage.c index 9d0bbe8858d..59879e61592 100644 --- a/src/common/slurm_accounting_storage.c +++ b/src/common/slurm_accounting_storage.c @@ -142,6 +142,9 @@ typedef struct slurm_acct_storage_ops { List selected_parts, void *params); int (*update_shares_used) (void *db_conn, List shares_used); + int (*flush_jobs) (void *db_conn, + char *cluster, + time_t event_time); } slurm_acct_storage_ops_t; typedef struct slurm_acct_storage_context { @@ -211,7 +214,8 @@ static slurm_acct_storage_ops_t * _acct_storage_get_ops( "jobacct_storage_p_suspend", "jobacct_storage_p_get_jobs", "jobacct_storage_p_archive", - "acct_storage_p_update_shares_used" + "acct_storage_p_update_shares_used", + "acct_storage_p_flush_jobs_on_cluster" }; int n_syms = sizeof( syms ) / sizeof( char * ); @@ -1561,20 +1565,39 @@ extern acct_admin_level_t str_2_acct_admin_level(char *level) extern void log_assoc_rec(acct_association_rec_t *assoc_ptr) { - info("association rec id: %u", assoc_ptr->id); - info(" acct : %s", assoc_ptr->acct); - info(" cluster : %s", assoc_ptr->cluster); - info(" fairshare : %u", assoc_ptr->fairshare); - info(" max_cpu_secs_per_job : %u", assoc_ptr->max_cpu_secs_per_job); - info(" max_jobs : %u", assoc_ptr->max_jobs); - info(" max_nodes_per_job 
: %u", assoc_ptr->max_nodes_per_job); - info(" max_wall_duration_per_job : %u", - assoc_ptr->max_wall_duration_per_job); - info(" parent_acct : %s", assoc_ptr->parent_acct); - info(" partition : %s", assoc_ptr->partition); - info(" user : %s(%u)", assoc_ptr->user, assoc_ptr->uid); - info(" used_jobs : %u", assoc_ptr->used_jobs); - info(" used_share : %u", assoc_ptr->used_share); + debug("association rec id : %u", assoc_ptr->id); + debug(" acct : %s", assoc_ptr->acct); + debug(" cluster : %s", assoc_ptr->cluster); + if(assoc_ptr->fairshare == INFINITE) + debug(" fairshare : NONE"); + else + debug(" fairshare : %u", + assoc_ptr->fairshare); + if(assoc_ptr->max_cpu_secs_per_job == INFINITE) + debug(" max_cpu_secs_per_job : NONE"); + else + debug(" max_cpu_secs_per_job : %d", + assoc_ptr->max_cpu_secs_per_job); + if(assoc_ptr->max_jobs == INFINITE) + debug(" max_jobs : NONE"); + else + debug(" max_jobs : %u", assoc_ptr->max_jobs); + if(assoc_ptr->max_nodes_per_job == INFINITE) + debug(" max_nodes_per_job : NONE"); + else + debug(" max_nodes_per_job : %d", + assoc_ptr->max_nodes_per_job); + if(assoc_ptr->max_wall_duration_per_job == INFINITE) + debug(" max_wall_duration_per_job : NONE"); + else + debug(" max_wall_duration_per_job : %d", + assoc_ptr->max_wall_duration_per_job); + debug(" parent_acct : %s", assoc_ptr->parent_acct); + debug(" partition : %s", assoc_ptr->partition); + debug(" user : %s(%u)", + assoc_ptr->user, assoc_ptr->uid); + debug(" used_jobs : %u", assoc_ptr->used_jobs); + debug(" used_share : %u", assoc_ptr->used_share); } /* @@ -1984,3 +2007,21 @@ extern int acct_storage_g_update_shares_used(void *db_conn, List acct_list) acct_list); } +/* + * This should be called when a cluster does a cold start to flush out + * any jobs that were running during the restart so we don't have any + * jobs in the database "running" forever since no endtime will be + * placed in there otherwise. 
+ * IN: char * = cluster name + * RET: SLURM_SUCCESS on success SLURM_ERROR else + */ +extern int acct_storage_g_flush_jobs_on_cluster( + void *db_conn, char *cluster, time_t event_time) +{ + if (slurm_acct_storage_init(NULL) < 0) + return SLURM_ERROR; + return (*(g_acct_storage_context->ops.flush_jobs)) + (db_conn, cluster, event_time); + +} + diff --git a/src/common/slurm_accounting_storage.h b/src/common/slurm_accounting_storage.h index e737bd35ae3..1aec1b9f619 100644 --- a/src/common/slurm_accounting_storage.h +++ b/src/common/slurm_accounting_storage.h @@ -468,6 +468,17 @@ extern int acct_storage_g_roll_usage(void *db_conn); */ extern int acct_storage_g_update_shares_used(void *db_conn, List acct_list); +/* + * This should be called when a cluster does a cold start to flush out + * any jobs that were running during the restart so we don't have any + * jobs in the database "running" forever since no endtime will be + * placed in there otherwise. + * IN: char * = cluster name + * RET: SLURM_SUCCESS on success SLURM_ERROR else + */ +extern int acct_storage_g_flush_jobs_on_cluster( + void *db_conn, char *cluster, time_t event_time); + /*********************** CLUSTER ACCOUNTING STORAGE **************************/ extern int clusteracct_storage_g_node_down(void *db_conn, diff --git a/src/common/slurmdbd_defs.c b/src/common/slurmdbd_defs.c index 0fcd2eec408..9b61450b182 100644 --- a/src/common/slurmdbd_defs.c +++ b/src/common/slurmdbd_defs.c @@ -353,6 +353,7 @@ extern Buf pack_slurmdbd_msg(slurmdbd_msg_t *req) buffer); break; case DBD_CLUSTER_PROCS: + case DBD_FLUSH_JOBS: slurmdbd_pack_cluster_procs_msg( (dbd_cluster_procs_msg_t *)req->data, buffer); break; @@ -467,6 +468,7 @@ extern int unpack_slurmdbd_msg(slurmdbd_msg_t *resp, Buf buffer) (dbd_acct_coord_msg_t **)&resp->data, buffer); break; case DBD_CLUSTER_PROCS: + case DBD_FLUSH_JOBS: rc = slurmdbd_unpack_cluster_procs_msg( (dbd_cluster_procs_msg_t **)&resp->data, buffer); break; diff --git 
a/src/common/slurmdbd_defs.h b/src/common/slurmdbd_defs.h index 0acd5435c43..f11647fd2a2 100644 --- a/src/common/slurmdbd_defs.h +++ b/src/common/slurmdbd_defs.h @@ -70,6 +70,8 @@ typedef enum { DBD_ADD_CLUSTERS, /* Add new cluster to the mix */ DBD_ADD_USERS, /* Add new user to the mix */ DBD_CLUSTER_PROCS, /* Record total processors on cluster */ + DBD_FLUSH_JOBS, /* End jobs that are still running + * when a controller is restarted. */ DBD_GET_ACCOUNTS, /* Get account information */ DBD_GET_ASSOCS, /* Get assocation information */ DBD_GET_ASSOC_USAGE, /* Get assoc usage information */ diff --git a/src/database/mysql_common.c b/src/database/mysql_common.c index 2a159c980c4..24e4de9ec8f 100644 --- a/src/database/mysql_common.c +++ b/src/database/mysql_common.c @@ -304,6 +304,13 @@ extern int mysql_db_query(MYSQL *mysql_db, char *query) return SLURM_SUCCESS; } +extern int mysql_db_ping(MYSQL *mysql_db) +{ + /* clear out the old results so we don't get a 2014 error */ + _clear_results(mysql_db); + return mysql_ping(mysql_db); +} + extern int mysql_db_commit(MYSQL *mysql_db) { //slurm_mutex_lock(&mysql_lock); diff --git a/src/database/mysql_common.h b/src/database/mysql_common.h index 0f2ca83d9c8..69acb54c499 100644 --- a/src/database/mysql_common.h +++ b/src/database/mysql_common.h @@ -81,6 +81,7 @@ extern int mysql_get_db_connection(MYSQL **mysql_db, char *db_name, extern int mysql_close_db_connection(MYSQL **mysql_db); extern int mysql_cleanup(); extern int mysql_db_query(MYSQL *mysql_db, char *query); +extern int mysql_db_ping(MYSQL *mysql_db); extern int mysql_db_commit(MYSQL *mysql_db); extern int mysql_db_rollback(MYSQL *mysql_db); diff --git a/src/plugins/accounting_storage/filetxt/accounting_storage_filetxt.c b/src/plugins/accounting_storage/filetxt/accounting_storage_filetxt.c index 1be52a9415e..458305e457d 100644 --- a/src/plugins/accounting_storage/filetxt/accounting_storage_filetxt.c +++ 
b/src/plugins/accounting_storage/filetxt/accounting_storage_filetxt.c @@ -820,3 +820,10 @@ extern int acct_storage_p_update_shares_used(void *db_conn, { return SLURM_SUCCESS; } + +extern int acct_storage_p_flush_jobs_on_cluster( + void *db_conn, char *cluster, time_t event_time) +{ + /* put end times for a clean start */ + return SLURM_SUCCESS; +} diff --git a/src/plugins/accounting_storage/mysql/accounting_storage_mysql.c b/src/plugins/accounting_storage/mysql/accounting_storage_mysql.c index 490216118f9..b367d1df1f8 100644 --- a/src/plugins/accounting_storage/mysql/accounting_storage_mysql.c +++ b/src/plugins/accounting_storage/mysql/accounting_storage_mysql.c @@ -110,6 +110,7 @@ char *step_table = "step_table"; char *txn_table = "txn_table"; char *user_table = "user_table"; char *last_ran_table = "last_ran_table"; +char *suspend_table = "suspend_table"; extern int acct_storage_p_commit(mysql_conn_t *mysql_conn, bool commit); @@ -875,7 +876,6 @@ static int _mysql_acct_check_tables(MYSQL *acct_mysql_db) { "deleted", "tinyint default 0" }, { "id", "int not null" }, { "period_start", "int unsigned not null" }, - { "cpu_count", "int unsigned default 0" }, { "alloc_cpu_secs", "bigint default 0" }, { NULL, NULL} }; @@ -983,6 +983,14 @@ static int _mysql_acct_check_tables(MYSQL *acct_mysql_db) { NULL, NULL} }; + storage_field_t suspend_table_fields[] = { + { "id", "int not null" }, + { "associd", "mediumint not null" }, + { "start", "int unsigned default 0 not null" }, + { "end", "int unsigned default 0 not null" }, + { NULL, NULL} + }; + storage_field_t txn_table_fields[] = { { "id", "int not null auto_increment" }, { "timestamp", "int unsigned default 0 not null" }, @@ -1124,6 +1132,11 @@ static int _mysql_acct_check_tables(MYSQL *acct_mysql_db) ", primary key (id, stepid))") == SLURM_ERROR) return SLURM_ERROR; + if(mysql_db_create_table(acct_mysql_db, suspend_table, + suspend_table_fields, + ")") == SLURM_ERROR) + return SLURM_ERROR; + 
if(mysql_db_create_table(acct_mysql_db, txn_table, txn_table_fields, ", primary key (id))") == SLURM_ERROR) return SLURM_ERROR; @@ -1259,12 +1272,21 @@ extern int acct_storage_p_close_connection(mysql_conn_t **mysql_conn) extern int acct_storage_p_commit(mysql_conn_t *mysql_conn, bool commit) { #ifdef HAVE_MYSQL - - if(!mysql_conn) + + if(!mysql_conn) { + error("We need a connection to run this"); return SLURM_ERROR; + } else if(!mysql_conn->acct_mysql_db + || mysql_db_ping(mysql_conn->acct_mysql_db) != 0) { + if(!mysql_get_db_connection(&mysql_conn->acct_mysql_db, + mysql_db_name, mysql_db_info)) { + error("unable to re-connect to mysql database"); + return SLURM_ERROR; + } + } debug4("got %d commits", list_count(mysql_conn->update_list)); - + if(mysql_conn->rollback) { if(!commit) { if(mysql_db_rollback(mysql_conn->acct_mysql_db)) @@ -1302,7 +1324,7 @@ extern int acct_storage_p_commit(mysql_conn_t *mysql_conn, bool commit) } xfree(query); while((row = mysql_fetch_row(result))) { - //info("sending to %s(%s)", row[0], row[1]); + info("sending to %s(%s)", row[0], row[1]); slurm_set_addr_char(&req.address, atoi(row[1]), row[0]); req.msg_type = ACCOUNTING_UPDATE_MSG; req.flags = SLURM_GLOBAL_AUTH_KEY; @@ -1390,6 +1412,18 @@ extern int acct_storage_p_add_users(mysql_conn_t *mysql_conn, uint32_t uid, int affect_rows = 0; List assoc_list = list_create(destroy_acct_association_rec); + if(!mysql_conn) { + error("We need a connection to run this"); + return SLURM_ERROR; + } else if(!mysql_conn->acct_mysql_db + || mysql_db_ping(mysql_conn->acct_mysql_db) != 0) { + if(!mysql_get_db_connection(&mysql_conn->acct_mysql_db, + mysql_db_name, mysql_db_info)) { + error("unable to re-connect to mysql database"); + return SLURM_ERROR; + } + } + if((pw=getpwuid(uid))) { user = pw->pw_name; } @@ -1521,6 +1555,18 @@ extern int acct_storage_p_add_accts(mysql_conn_t *mysql_conn, uint32_t uid, int affect_rows = 0; List assoc_list = list_create(destroy_acct_association_rec); + 
if(!mysql_conn) { + error("We need a connection to run this"); + return SLURM_ERROR; + } else if(!mysql_conn->acct_mysql_db + || mysql_db_ping(mysql_conn->acct_mysql_db) != 0) { + if(!mysql_get_db_connection(&mysql_conn->acct_mysql_db, + mysql_db_name, mysql_db_info)) { + error("unable to re-connect to mysql database"); + return SLURM_ERROR; + } + } + if((pw=getpwuid(uid))) { user = pw->pw_name; } @@ -1638,6 +1684,18 @@ extern int acct_storage_p_add_clusters(mysql_conn_t *mysql_conn, uint32_t uid, char *user = NULL; int affect_rows = 0; + if(!mysql_conn) { + error("We need a connection to run this"); + return SLURM_ERROR; + } else if(!mysql_conn->acct_mysql_db + || mysql_db_ping(mysql_conn->acct_mysql_db) != 0) { + if(!mysql_get_db_connection(&mysql_conn->acct_mysql_db, + mysql_db_name, mysql_db_info)) { + error("unable to re-connect to mysql database"); + return SLURM_ERROR; + } + } + if((pw=getpwuid(uid))) { user = pw->pw_name; } @@ -1833,6 +1891,18 @@ extern int acct_storage_p_add_associations(mysql_conn_t *mysql_conn, return SLURM_ERROR; } + if(!mysql_conn) { + error("We need a connection to run this"); + return SLURM_ERROR; + } else if(!mysql_conn->acct_mysql_db + || mysql_db_ping(mysql_conn->acct_mysql_db) != 0) { + if(!mysql_get_db_connection(&mysql_conn->acct_mysql_db, + mysql_db_name, mysql_db_info)) { + error("unable to re-connect to mysql database"); + return SLURM_ERROR; + } + } + if((pw=getpwuid(uid))) { user = pw->pw_name; } @@ -2199,6 +2269,16 @@ extern List acct_storage_p_modify_users(mysql_conn_t *mysql_conn, uint32_t uid, return NULL; } + if(!mysql_conn) { + error("We need a connection to run this"); + return NULL; + } else if(!mysql_conn->acct_mysql_db + || mysql_db_ping(mysql_conn->acct_mysql_db) != 0) { + if(!mysql_get_db_connection(&mysql_conn->acct_mysql_db, + mysql_db_name, mysql_db_info)) + return NULL; + } + if((pw=getpwuid(uid))) { user_name = pw->pw_name; } @@ -2322,6 +2402,18 @@ extern List acct_storage_p_modify_accts(mysql_conn_t 
*mysql_conn, uint32_t uid, return NULL; } + if(!mysql_conn) { + error("We need a connection to run this"); + return NULL; + } else if(!mysql_conn->acct_mysql_db + || mysql_db_ping(mysql_conn->acct_mysql_db) != 0) { + if(!mysql_get_db_connection(&mysql_conn->acct_mysql_db, + mysql_db_name, mysql_db_info)) { + error("unable to re-connect to mysql database"); + return NULL; + } + } + if((pw=getpwuid(uid))) { user = pw->pw_name; } @@ -2391,6 +2483,7 @@ extern List acct_storage_p_modify_accts(mysql_conn_t *mysql_conn, uint32_t uid, if(!(result = mysql_db_query_ret( mysql_conn->acct_mysql_db, query, 0))) { xfree(query); + xfree(vals); return NULL; } xfree(query); @@ -2444,7 +2537,7 @@ extern List acct_storage_p_modify_clusters(mysql_conn_t *mysql_conn, List ret_list = NULL; int rc = SLURM_SUCCESS; char *object = NULL; - char *vals = NULL, *assoc_vals = NULL, *extra = NULL, *query = NULL, + char *vals = NULL, *extra = NULL, *query = NULL, *name_char = NULL, *assoc_char= NULL, *send_char = NULL; time_t now = time(NULL); struct passwd *pw = NULL; @@ -2453,11 +2546,28 @@ extern List acct_storage_p_modify_clusters(mysql_conn_t *mysql_conn, MYSQL_RES *result = NULL; MYSQL_ROW row; + /* If you need to alter the default values of the cluster use + * modify_associations since this is used only for registering + * the controller when it loads + */ + if(!cluster_q) { error("we need something to change"); return NULL; } + if(!mysql_conn) { + error("We need a connection to run this"); + return NULL; + } else if(!mysql_conn->acct_mysql_db + || mysql_db_ping(mysql_conn->acct_mysql_db) != 0) { + if(!mysql_get_db_connection(&mysql_conn->acct_mysql_db, + mysql_db_name, mysql_db_info)) { + error("unable to re-connect to mysql database"); + return NULL; + } + } + if((pw=getpwuid(uid))) { user = pw->pw_name; } @@ -2485,40 +2595,7 @@ extern List acct_storage_p_modify_clusters(mysql_conn_t *mysql_conn, xstrfmtcat(vals, ", control_port=%u", cluster->control_port); } - 
if((int)cluster->default_fairshare >= 0) { - xstrfmtcat(assoc_vals, ", fairshare=%u", - cluster->default_fairshare); - } else if((int)cluster->default_fairshare == -1) - xstrfmtcat(assoc_vals, ", fairshare=1"); - - if((int)cluster->default_max_cpu_secs_per_job >= 0) { - xstrfmtcat(assoc_vals, ", max_cpu_secs_per_job=%u", - cluster->default_max_cpu_secs_per_job); - } else if((int)cluster->default_max_cpu_secs_per_job == -1) - xstrfmtcat(assoc_vals, ", max_cpu_secs_per_job=NULL"); - - if((int)cluster->default_max_jobs >= 0) { - xstrfmtcat(assoc_vals, ", max_jobs=%u", - cluster->default_max_jobs); - } else if((int)cluster->default_max_jobs == -1) - xstrfmtcat(assoc_vals, ", max_jobs=NULL"); - - - if((int)cluster->default_max_nodes_per_job >= 0) { - xstrfmtcat(assoc_vals, ", max_nodes_per_job=%u", - cluster->default_max_nodes_per_job); - } else if((int)cluster->default_max_nodes_per_job == -1) - xstrfmtcat(assoc_vals, ", max_nodes_per_job=NULL"); - - - if((int)cluster->default_max_wall_duration_per_job >= 0) { - xstrfmtcat(assoc_vals, ", max_wall_duration_per_job=%u", - cluster->default_max_wall_duration_per_job); - } else if((int)cluster->default_max_wall_duration_per_job == -1) - xstrfmtcat(assoc_vals, ", max_wall_duration_per_job=NULL"); - - - if(!vals && !assoc_vals) { + if(!vals) { error("Nothing to change"); return NULL; } @@ -2530,7 +2607,6 @@ extern List acct_storage_p_modify_clusters(mysql_conn_t *mysql_conn, mysql_conn->acct_mysql_db, query, 0))) { xfree(query); xfree(vals); - xfree(assoc_vals); error("no result given for %s", extra); return NULL; } @@ -2539,42 +2615,19 @@ extern List acct_storage_p_modify_clusters(mysql_conn_t *mysql_conn, rc = 0; ret_list = list_create(slurm_destroy_char); while((row = mysql_fetch_row(result))) { - acct_association_rec_t *assoc = NULL; - object = xstrdup(row[0]); list_append(ret_list, object); if(!rc) { xstrfmtcat(name_char, "name='%s'", object); - xstrfmtcat(assoc_char, "cluster='%s'", object); rc = 1; } else { 
xstrfmtcat(name_char, " || name='%s'", object); - xstrfmtcat(assoc_char, " || cluster='%s'", object); - } - if(assoc_vals) { - assoc = xmalloc(sizeof(acct_association_rec_t)); - assoc->cluster = xstrdup(object); - assoc->acct = xstrdup("root"); - assoc->fairshare = cluster->default_fairshare; - assoc->max_jobs = cluster->default_max_jobs; - assoc->max_nodes_per_job = - cluster->default_max_nodes_per_job; - assoc->max_wall_duration_per_job = - cluster->default_max_wall_duration_per_job; - assoc->max_cpu_secs_per_job = - cluster->default_max_cpu_secs_per_job; - - if(_addto_update_list(mysql_conn->update_list, - ACCT_MODIFY_ASSOC, - assoc) != SLURM_SUCCESS) - error("couldn't add to the update list"); } } mysql_free_result(result); if(!list_count(ret_list)) { debug3("didn't effect anything"); - xfree(assoc_vals); xfree(vals); return ret_list; } @@ -2591,23 +2644,10 @@ extern List acct_storage_p_modify_clusters(mysql_conn_t *mysql_conn, } } - if(assoc_vals) { - send_char = xstrdup_printf("acct='root' && (%s)", - assoc_char); - if(_modify_common(mysql_conn, DBD_MODIFY_CLUSTERS, now, - user, assoc_table, send_char, assoc_vals) - == SLURM_ERROR) { - error("Couldn't modify cluster"); - list_destroy(ret_list); - ret_list = NULL; - goto end_it; - } - } end_it: xfree(name_char); xfree(assoc_char); xfree(vals); - xfree(assoc_vals); xfree(send_char); return ret_list; @@ -2661,6 +2701,18 @@ extern List acct_storage_p_modify_associations(mysql_conn_t *mysql_conn, return NULL; } + if(!mysql_conn) { + error("We need a connection to run this"); + return NULL; + } else if(!mysql_conn->acct_mysql_db + || mysql_db_ping(mysql_conn->acct_mysql_db) != 0) { + if(!mysql_get_db_connection(&mysql_conn->acct_mysql_db, + mysql_db_name, mysql_db_info)) { + error("unable to re-connect to mysql database"); + return NULL; + } + } + if((pw=getpwuid(uid))) { user = pw->pw_name; } @@ -2934,6 +2986,18 @@ extern List acct_storage_p_remove_users(mysql_conn_t *mysql_conn, uint32_t uid, return NULL; } + 
if(!mysql_conn) { + error("We need a connection to run this"); + return NULL; + } else if(!mysql_conn->acct_mysql_db + || mysql_db_ping(mysql_conn->acct_mysql_db) != 0) { + if(!mysql_get_db_connection(&mysql_conn->acct_mysql_db, + mysql_db_name, mysql_db_info)) { + error("unable to re-connect to mysql database"); + return NULL; + } + } + if((pw=getpwuid(uid))) { user_name = pw->pw_name; } @@ -3033,7 +3097,7 @@ extern List acct_storage_p_remove_coord(mysql_conn_t *mysql_conn, uint32_t uid, char *acct, acct_user_cond_t *user_q) { #ifdef HAVE_MYSQL - return SLURM_SUCCESS; + return NULL; #else return NULL; #endif @@ -3065,6 +3129,18 @@ extern List acct_storage_p_remove_accts(mysql_conn_t *mysql_conn, uint32_t uid, user_name = pw->pw_name; } + if(!mysql_conn) { + error("We need a connection to run this"); + return NULL; + } else if(!mysql_conn->acct_mysql_db + || mysql_db_ping(mysql_conn->acct_mysql_db) != 0) { + if(!mysql_get_db_connection(&mysql_conn->acct_mysql_db, + mysql_db_name, mysql_db_info)) { + error("unable to re-connect to mysql database"); + return NULL; + } + } + xstrcat(extra, "where deleted=0"); if(acct_q->acct_list && list_count(acct_q->acct_list)) { set = 0; @@ -3188,6 +3264,18 @@ extern List acct_storage_p_remove_clusters(mysql_conn_t *mysql_conn, return NULL; } + if(!mysql_conn) { + error("We need a connection to run this"); + return NULL; + } else if(!mysql_conn->acct_mysql_db + || mysql_db_ping(mysql_conn->acct_mysql_db) != 0) { + if(!mysql_get_db_connection(&mysql_conn->acct_mysql_db, + mysql_db_name, mysql_db_info)) { + error("unable to re-connect to mysql database"); + return NULL; + } + } + if((pw=getpwuid(uid))) { user_name = pw->pw_name; } @@ -3336,6 +3424,18 @@ extern List acct_storage_p_remove_associations(mysql_conn_t *mysql_conn, return NULL; } + if(!mysql_conn) { + error("We need a connection to run this"); + return NULL; + } else if(!mysql_conn->acct_mysql_db + || mysql_db_ping(mysql_conn->acct_mysql_db) != 0) { + 
if(!mysql_get_db_connection(&mysql_conn->acct_mysql_db, + mysql_db_name, mysql_db_info)) { + error("unable to re-connect to mysql database"); + return NULL; + } + } + xstrcat(extra, "where id>0 && deleted=0"); if((pw=getpwuid(uid))) { @@ -3547,6 +3647,18 @@ extern List acct_storage_p_get_users(mysql_conn_t *mysql_conn, USER_REQ_COUNT }; + if(!mysql_conn) { + error("We need a connection to run this"); + return NULL; + } else if(!mysql_conn->acct_mysql_db + || mysql_db_ping(mysql_conn->acct_mysql_db) != 0) { + if(!mysql_get_db_connection(&mysql_conn->acct_mysql_db, + mysql_db_name, mysql_db_info)) { + error("unable to re-connect to mysql database"); + return NULL; + } + } + xstrcat(extra, "where deleted=0"); if(!user_q) @@ -3712,6 +3824,18 @@ extern List acct_storage_p_get_accts(mysql_conn_t *mysql_conn, ACCT_REQ_COUNT }; + if(!mysql_conn) { + error("We need a connection to run this"); + return NULL; + } else if(!mysql_conn->acct_mysql_db + || mysql_db_ping(mysql_conn->acct_mysql_db) != 0) { + if(!mysql_get_db_connection(&mysql_conn->acct_mysql_db, + mysql_db_name, mysql_db_info)) { + error("unable to re-connect to mysql database"); + return NULL; + } + } + xstrcat(extra, "where deleted=0"); if(!acct_q) goto empty; @@ -3885,6 +4009,18 @@ extern List acct_storage_p_get_clusters(mysql_conn_t *mysql_conn, ASSOC_REQ_COUNT }; + if(!mysql_conn) { + error("We need a connection to run this"); + return NULL; + } else if(!mysql_conn->acct_mysql_db + || mysql_db_ping(mysql_conn->acct_mysql_db) != 0) { + if(!mysql_get_db_connection(&mysql_conn->acct_mysql_db, + mysql_db_name, mysql_db_info)) { + error("unable to re-connect to mysql database"); + return NULL; + } + } + xstrcat(extra, "where deleted=0"); if(!cluster_q) @@ -4053,6 +4189,18 @@ extern List acct_storage_p_get_associations(mysql_conn_t *mysql_conn, ASSOC2_REQ_MCPJ }; + if(!mysql_conn) { + error("We need a connection to run this"); + return NULL; + } else if(!mysql_conn->acct_mysql_db + || 
mysql_db_ping(mysql_conn->acct_mysql_db) != 0) { + if(!mysql_get_db_connection(&mysql_conn->acct_mysql_db, + mysql_db_name, mysql_db_info)) { + error("unable to re-connect to mysql database"); + return NULL; + } + } + xstrcat(extra, "where deleted=0"); if(!assoc_q) goto empty; @@ -4307,6 +4455,18 @@ extern int acct_storage_p_roll_usage(mysql_conn_t *mysql_conn) UPDATE_COUNT }; + if(!mysql_conn) { + error("We need a connection to run this"); + return SLURM_ERROR; + } else if(!mysql_conn->acct_mysql_db + || mysql_db_ping(mysql_conn->acct_mysql_db) != 0) { + if(!mysql_get_db_connection(&mysql_conn->acct_mysql_db, + mysql_db_name, mysql_db_info)) { + error("unable to re-connect to mysql database"); + return SLURM_ERROR; + } + } + i=0; xstrfmtcat(tmp, "%s", update_req_inx[i]); for(i=1; i<UPDATE_COUNT; i++) { @@ -4344,10 +4504,13 @@ extern int acct_storage_p_roll_usage(mysql_conn_t *mysql_conn) if(rc == SLURM_ERROR) return rc; } - last_hour = 1211403599; + last_hour = 1211475599; + last_day = 1211475599; + last_month = 1211475599; +// last_hour = 1211403599; // last_hour = 1206946800; - last_day = 1207033199; - last_month = 1204358399; +// last_day = 1207033199; +// last_month = 1204358399; if(!localtime_r(&last_hour, &start_tm)) { error("Couldn't get localtime from hour start %d", last_hour); @@ -4469,6 +4632,18 @@ extern int clusteracct_storage_p_node_down(mysql_conn_t *mysql_conn, char *query = NULL; char *my_reason; + if(!mysql_conn) { + error("We need a connection to run this"); + return SLURM_ERROR; + } else if(!mysql_conn->acct_mysql_db + || mysql_db_ping(mysql_conn->acct_mysql_db) != 0) { + if(!mysql_get_db_connection(&mysql_conn->acct_mysql_db, + mysql_db_name, mysql_db_info)) { + error("unable to re-connect to mysql database"); + return SLURM_ERROR; + } + } + if (slurmctld_conf.fast_schedule && !slurmdbd_conf) cpus = node_ptr->config_ptr->cpus; else @@ -4508,6 +4683,18 @@ extern int clusteracct_storage_p_node_up(mysql_conn_t *mysql_conn, char* query; int rc = 
SLURM_SUCCESS; + if(!mysql_conn) { + error("We need a connection to run this"); + return SLURM_ERROR; + } else if(!mysql_conn->acct_mysql_db + || mysql_db_ping(mysql_conn->acct_mysql_db) != 0) { + if(!mysql_get_db_connection(&mysql_conn->acct_mysql_db, + mysql_db_name, mysql_db_info)) { + error("unable to re-connect to mysql database"); + return SLURM_ERROR; + } + } + query = xstrdup_printf( "update %s set period_end=%d where cluster='%s' " "and period_end=0 and node_name='%s';", @@ -4545,6 +4732,18 @@ extern int clusteracct_storage_p_cluster_procs(mysql_conn_t *mysql_conn, } last_procs = procs; + if(!mysql_conn) { + error("We need a connection to run this"); + return SLURM_ERROR; + } else if(!mysql_conn->acct_mysql_db + || mysql_db_ping(mysql_conn->acct_mysql_db) != 0) { + if(!mysql_get_db_connection(&mysql_conn->acct_mysql_db, + mysql_db_name, mysql_db_info)) { + error("unable to re-connect to mysql database"); + return SLURM_ERROR; + } + } + /* Record the processor count */ query = xstrdup_printf( "select cpu_count from %s where cluster='%s' " @@ -4631,13 +4830,14 @@ extern int jobacct_storage_p_job_start(mysql_conn_t *mysql_conn, error("We need a connection to run this"); return SLURM_ERROR; } else if(!mysql_conn->acct_mysql_db - || mysql_ping(mysql_conn->acct_mysql_db) != 0) { + || mysql_db_ping(mysql_conn->acct_mysql_db) != 0) { if(!mysql_get_db_connection(&mysql_conn->acct_mysql_db, - mysql_db_name, mysql_db_info)) + mysql_db_name, mysql_db_info)) { + error("unable to re-connect to mysql database"); return SLURM_ERROR; + } } - debug2("mysql_jobacct_job_start() called"); priority = (job_ptr->priority == NO_VAL) ? 
-1L : (long) job_ptr->priority; @@ -4758,10 +4958,12 @@ extern int jobacct_storage_p_job_complete(mysql_conn_t *mysql_conn, error("We need a connection to run this"); return SLURM_ERROR; } else if(!mysql_conn->acct_mysql_db - || mysql_ping(mysql_conn->acct_mysql_db) != 0) { + || mysql_db_ping(mysql_conn->acct_mysql_db) != 0) { if(!mysql_get_db_connection(&mysql_conn->acct_mysql_db, - mysql_db_name, mysql_db_info)) + mysql_db_name, mysql_db_info)) { + error("unable to re-connect to mysql database"); return SLURM_ERROR; + } } debug2("mysql_jobacct_job_complete() called"); if (job_ptr->end_time == 0) { @@ -4826,10 +5028,12 @@ extern int jobacct_storage_p_step_start(mysql_conn_t *mysql_conn, error("We need a connection to run this"); return SLURM_ERROR; } else if(!mysql_conn->acct_mysql_db - || mysql_ping(mysql_conn->acct_mysql_db) != 0) { + || mysql_db_ping(mysql_conn->acct_mysql_db) != 0) { if(!mysql_get_db_connection(&mysql_conn->acct_mysql_db, - mysql_db_name, mysql_db_info)) + mysql_db_name, mysql_db_info)) { + error("unable to re-connect to mysql database"); return SLURM_ERROR; + } } if(slurmdbd_conf) { cpus = step_ptr->job_ptr->total_procs; @@ -4931,10 +5135,12 @@ extern int jobacct_storage_p_step_complete(mysql_conn_t *mysql_conn, error("We need a connection to run this"); return SLURM_ERROR; } else if(!mysql_conn->acct_mysql_db - || mysql_ping(mysql_conn->acct_mysql_db) != 0) { + || mysql_db_ping(mysql_conn->acct_mysql_db) != 0) { if(!mysql_get_db_connection(&mysql_conn->acct_mysql_db, - mysql_db_name, mysql_db_info)) + mysql_db_name, mysql_db_info)) { + error("unable to re-connect to mysql database"); return SLURM_ERROR; + } } if(slurmdbd_conf) { @@ -5048,17 +5254,20 @@ extern int jobacct_storage_p_suspend(mysql_conn_t *mysql_conn, struct job_record *job_ptr) { #ifdef HAVE_MYSQL - char query[1024]; + char *query = NULL; int rc = SLURM_SUCCESS; - + bool suspended = false; + if(!mysql_conn) { error("We need a connection to run this"); return SLURM_ERROR; } else 
if(!mysql_conn->acct_mysql_db - || mysql_ping(mysql_conn->acct_mysql_db) != 0) { + || mysql_db_ping(mysql_conn->acct_mysql_db) != 0) { if(!mysql_get_db_connection(&mysql_conn->acct_mysql_db, - mysql_db_name, mysql_db_info)) + mysql_db_name, mysql_db_info)) { + error("unable to re-connect to mysql database"); return SLURM_ERROR; + } } if(!job_ptr->db_index) { job_ptr->db_index = _get_db_index(mysql_conn->acct_mysql_db, @@ -5069,20 +5278,38 @@ extern int jobacct_storage_p_suspend(mysql_conn_t *mysql_conn, return SLURM_ERROR; } - snprintf(query, sizeof(query), - "update %s set suspended=%u-suspended, state=%d " - "where id=%u", - job_table, (int)job_ptr->suspend_time, - job_ptr->job_state & (~JOB_COMPLETING), - job_ptr->db_index); + if (job_ptr->job_state == JOB_SUSPENDED) + suspended = true; + + xstrfmtcat(query, + "update %s set suspended=%d-suspended, state=%d " + "where id=%u;", + job_table, (int)job_ptr->suspend_time, + job_ptr->job_state & (~JOB_COMPLETING), + job_ptr->db_index); + if(suspended) + xstrfmtcat(query, + "insert into %s (id, associd, start, end) " + "values (%u, %u, %d, 0);", + suspend_table, job_ptr->assoc_id, job_ptr->db_index, + (int)job_ptr->suspend_time); + else + xstrfmtcat(query, + "update %s set end=%d where id=%u && end=0;", + suspend_table, (int)job_ptr->suspend_time, + job_ptr->db_index); + rc = mysql_db_query(mysql_conn->acct_mysql_db, query); + + xfree(query); if(rc != SLURM_ERROR) { - snprintf(query, sizeof(query), - "update %s set suspended=%u-suspended, " - "state=%d where id=%u and end=0", - step_table, (int)job_ptr->suspend_time, - job_ptr->job_state, job_ptr->db_index); + xstrfmtcat(query, + "update %s set suspended=%u-suspended, " + "state=%d where id=%u and end=0", + step_table, (int)job_ptr->suspend_time, + job_ptr->job_state, job_ptr->db_index); rc = mysql_db_query(mysql_conn->acct_mysql_db, query); + xfree(query); } return rc; @@ -5107,10 +5334,12 @@ extern List jobacct_storage_p_get_jobs(mysql_conn_t *mysql_conn, error("We 
need a connection to run this"); return NULL; } else if(!mysql_conn->acct_mysql_db - || mysql_ping(mysql_conn->acct_mysql_db) != 0) { + || mysql_db_ping(mysql_conn->acct_mysql_db) != 0) { if(!mysql_get_db_connection(&mysql_conn->acct_mysql_db, - mysql_db_name, mysql_db_info)) + mysql_db_name, mysql_db_info)) { + error("unable to re-connect to mysql database"); return NULL; + } } job_list = mysql_jobacct_process_get_jobs(mysql_conn, selected_steps, @@ -5132,7 +5361,7 @@ extern void jobacct_storage_p_archive(mysql_conn_t *mysql_conn, error("We need a connection to run this"); return; } else if(!mysql_conn->acct_mysql_db - || mysql_ping(mysql_conn->acct_mysql_db) != 0) { + || mysql_db_ping(mysql_conn->acct_mysql_db) != 0) { if(!mysql_get_db_connection(&mysql_conn->acct_mysql_db, mysql_db_name, mysql_db_info)) return; @@ -5143,10 +5372,40 @@ extern void jobacct_storage_p_archive(mysql_conn_t *mysql_conn, return; } -extern int acct_storage_p_update_shares_used(void *db_conn, +extern int acct_storage_p_update_shares_used(mysql_conn_t *mysql_conn, List shares_used) { /* This definitely needs to be fleshed out. 
* Go through the list of shares_used_object_t objects and store them */ return SLURM_SUCCESS; } + +extern int acct_storage_p_flush_jobs_on_cluster( + mysql_conn_t *mysql_conn, char *cluster, time_t event_time) +{ + /* put end times for a clean start */ + char *query = NULL; + int rc = SLURM_SUCCESS; + + if(!mysql_conn) { + error("We need a connection to run this"); + return SLURM_ERROR; + } else if(!mysql_conn->acct_mysql_db + || mysql_db_ping(mysql_conn->acct_mysql_db) != 0) { + if(!mysql_get_db_connection(&mysql_conn->acct_mysql_db, + mysql_db_name, mysql_db_info)) { + error("unable to re-connect to mysql database"); + return SLURM_ERROR; + } + } + + query = xstrdup_printf("update %s as t1, %s as t2 set t1.end=%u where " + "t2.id=t1.associd and t2.cluster='%s' " + "&& t1.end=0;", + job_table, assoc_table, event_time, cluster); + + rc = mysql_db_query(mysql_conn->acct_mysql_db, query); + xfree(query); + + return rc; +} diff --git a/src/plugins/accounting_storage/mysql/mysql_rollup.c b/src/plugins/accounting_storage/mysql/mysql_rollup.c index aeda3153dd9..452d1bb9aa2 100644 --- a/src/plugins/accounting_storage/mysql/mysql_rollup.c +++ b/src/plugins/accounting_storage/mysql/mysql_rollup.c @@ -42,21 +42,343 @@ #ifdef HAVE_MYSQL +typedef struct { + int assoc_id; + int a_cpu; +} local_assoc_usage_t; + +typedef struct { + char *name; + int a_cpu; + int cpu_count; + int d_cpu; + int i_cpu; + int r_cpu; + time_t start; + time_t end; +} local_cluster_usage_t; + + +extern void _destroy_local_assoc_usage(void *object) +{ + local_assoc_usage_t *a_usage = (local_assoc_usage_t *)object; + if(a_usage) { + xfree(a_usage); + } +} + +extern void _destroy_local_cluster_usage(void *object) +{ + local_cluster_usage_t *c_usage = (local_cluster_usage_t *)object; + if(c_usage) { + xfree(c_usage->name); + xfree(c_usage); + } +} + extern int mysql_hourly_rollup(mysql_conn_t *mysql_conn, time_t start, time_t end) { int add_sec = 3599; + int i=0; time_t curr_start = start; time_t curr_end = 
curr_start + add_sec; + char *query = NULL; + MYSQL_RES *result = NULL; + MYSQL_ROW row; + ListIterator a_itr = NULL; + ListIterator c_itr = NULL; + List assoc_usage_list = list_create(_destroy_local_assoc_usage); + List cluster_usage_list = list_create(_destroy_local_cluster_usage); + char *event_req_inx[] = { + "node_name", + "cluster", + "cpu_count", + "period_start", + "period_end" + }; + char *event_str = NULL; + enum { + EVENT_REQ_NAME, + EVENT_REQ_CLUSTER, + EVENT_REQ_CPU, + EVENT_REQ_START, + EVENT_REQ_END, + EVENT_REQ_COUNT + }; + char *job_req_inx[] = { + "jobid", + "associd", + "cluster", + "eligible", + "start", + "end", + "suspended", + "alloc_cpus", + "req_cpus" + }; + char *job_str = NULL; + enum { + JOB_REQ_JOBID, + JOB_REQ_ASSOCID, + JOB_REQ_CLUSTER, + JOB_REQ_ELG, + JOB_REQ_START, + JOB_REQ_END, + JOB_REQ_SUSPENDED, + JOB_REQ_ACPU, + JOB_REQ_RCPU, + JOB_REQ_COUNT + }; + + i=0; + xstrfmtcat(event_str, "%s", event_req_inx[i]); + for(i=1; i<EVENT_REQ_COUNT; i++) { + xstrfmtcat(event_str, ", %s", event_req_inx[i]); + } + + i=0; + xstrfmtcat(job_str, "%s", job_req_inx[i]); + for(i=1; i<JOB_REQ_COUNT; i++) { + xstrfmtcat(job_str, ", %s", job_req_inx[i]); + } /* info("begin start %s", ctime(&curr_start)); */ /* info("begin end %s", ctime(&curr_end)); */ + a_itr = list_iterator_create(cluster_usage_list); + c_itr = list_iterator_create(cluster_usage_list); while(curr_start < end) { + int last_id = 0; + local_cluster_usage_t *c_usage = NULL; + local_assoc_usage_t *a_usage = NULL; + + // first get the events during this time + query = xstrdup_printf("select %s from %s where " + "(period_start <= %d " + "&& period_end >= %d) " + "|| period_end = 0 " + "order by node_name, period_start", + event_str, event_table, + curr_end, curr_start); + + debug3("%d query\n%s", mysql_conn->conn, query); + if(!(result = mysql_db_query_ret( + mysql_conn->acct_mysql_db, query, 0))) { + xfree(query); + return SLURM_ERROR; + } + xfree(query); + + while((row = 
mysql_fetch_row(result))) { + int row_start = atoi(row[EVENT_REQ_START]); + int row_end = atoi(row[EVENT_REQ_END]); + int row_cpu = atoi(row[EVENT_REQ_CPU]); + + if(row_start < curr_start) + row_start = curr_start; + + if(!row_end) + row_end = curr_end; + if(!row[EVENT_REQ_NAME][0]) { + c_usage = + xmalloc(sizeof(local_cluster_usage_t)); + c_usage->name = xstrdup(row[EVENT_REQ_CLUSTER]); + c_usage->cpu_count = row_cpu; + c_usage->start = row_start; + c_usage->end = row_end; + list_append(cluster_usage_list, c_usage); + continue; + } + + list_iterator_reset(c_itr); + while((c_usage = list_next(c_itr))) { + if(!strcmp(c_usage->name, + row[EVENT_REQ_CLUSTER])) { + int local_start = row_start; + int local_end = row_end; + if(c_usage->start > local_start) + local_start = c_usage->start; + if(c_usage->end < local_end) + local_end = c_usage->end; + + if((local_end - local_start) < 1) + continue; + + info("node %s adds (%d)(%d-%d) * %d = %d " + "to %d", + row[EVENT_REQ_NAME], + (local_end - local_start)+1, + local_end, local_start, + row_cpu, + ((local_end - local_start)+1) + * row_cpu, + row_cpu); + /* need to add 1 sec to the + subtraction to get the + total time */ + c_usage->d_cpu += + ((local_end - local_start) + 1) + * row_cpu; + + /* don't break here just + incase the cpu count changed during + this time period. 
+ */ + } + } + } + mysql_free_result(result); + + query = xstrdup_printf("select %s from %s, %s as t2 where " + "((eligible <= %d && end >= %d) " + "|| end = 0 || start = 0) " + "&& associd=t2.id " + "order by associd, eligible", + job_str, job_table, assoc_table, + curr_end, curr_start); + + debug3("%d query\n%s", mysql_conn->conn, query); + if(!(result = mysql_db_query_ret( + mysql_conn->acct_mysql_db, query, 0))) { + xfree(query); + return SLURM_ERROR; + } + xfree(query); + + + while((row = mysql_fetch_row(result))) { + int job_id = atoi(row[JOB_REQ_ASSOCID]); + int assoc_id = atoi(row[JOB_REQ_ASSOCID]); + int row_eligible = atoi(row[JOB_REQ_ELG]); + int row_start = atoi(row[JOB_REQ_START]); + int row_end = atoi(row[JOB_REQ_END]); + int row_acpu = atoi(row[JOB_REQ_ACPU]); + int row_rcpu = atoi(row[JOB_REQ_RCPU]); + + if(row_start && (row_start < curr_start)) + row_start = curr_start; + if(!row_start && row_end) + row_start = row_end; + if(!row_end) + row_end = curr_end; + + if(last_id != assoc_id) { + a_usage = + xmalloc(sizeof(local_cluster_usage_t)); + a_usage->assoc_id = assoc_id; + list_append(assoc_usage_list, a_usage); + last_id = assoc_id; + } + + if(row_start) { + a_usage->a_cpu += + (row_end - row_start) + * row_acpu; + } + list_iterator_reset(c_itr); + while((c_usage = list_next(c_itr))) { + if(!strcmp(c_usage->name, + row[JOB_REQ_CLUSTER])) { + int local_start = row_start; + int local_end = row_end; + if(!local_start) + goto calc_resv; + + if(c_usage->start > local_start) + local_start = c_usage->start; + if(c_usage->end < local_end) + local_end = c_usage->end; + + if((local_end - local_start) < 1) + goto calc_resv; + info("%d assoc %d adds (%d)(%d-%d) * %d = %d " + "to %d", + job_id, + assoc_id, + local_end - local_start, + local_end, local_start, + row_acpu, + (local_end - local_start) + * row_acpu, + row_acpu); + c_usage->a_cpu += + (local_end - local_start) + * row_acpu; + calc_resv: + /* now reserved time */ + if(row_start < c_usage->start) + 
continue; + local_start = row_eligible; + local_end = row_start; + if(c_usage->start > local_start) + local_start = c_usage->start; + if(c_usage->end < local_end) + local_end = c_usage->end; + + if((local_end - local_start) < 1) + continue; + + info("%d assoc %d reserved (%d)(%d-%d) * %d = %d " + "to %d", + job_id, + assoc_id, + (local_end - local_start), + local_end, local_start, + row_rcpu, + (local_end - local_start) + * row_rcpu, + row_rcpu); + c_usage->r_cpu += + (local_end - local_start) + * row_rcpu; + + + /* don't break here just + incase the cpu count changed during + this time period. + */ + } + } + } + mysql_free_result(result); + + list_iterator_reset(c_itr); + while((c_usage = list_next(c_itr))) { + int total_time = ((curr_end - curr_start) + 1) + * c_usage->cpu_count; + + c_usage->i_cpu = total_time - c_usage->a_cpu - + c_usage->d_cpu - c_usage->r_cpu; + if(c_usage->i_cpu < 0) { + c_usage->r_cpu += c_usage->i_cpu; + c_usage->i_cpu = 0; + if(c_usage->r_cpu < 0) + c_usage->r_cpu = 0; + } + + info("cluster %s(%u) down %u alloc %u " + "resv %u idle %u total= %u from %s", c_usage->name, + c_usage->cpu_count, c_usage->d_cpu, c_usage->a_cpu, + c_usage->r_cpu, c_usage->i_cpu, + c_usage->d_cpu + c_usage->a_cpu + + c_usage->r_cpu + c_usage->i_cpu, + ctime(&c_usage->start)); + info("to %s", ctime(&c_usage->end)); + } + list_flush(assoc_usage_list); + list_flush(cluster_usage_list); curr_start = curr_end+1; curr_end = curr_start + add_sec; debug3("curr hour is now %d-%d", curr_start, curr_end); } + xfree(event_str); + xfree(job_str); + list_iterator_destroy(a_itr); + list_iterator_destroy(c_itr); + + list_destroy(assoc_usage_list); + list_destroy(cluster_usage_list); /* info("stop start %s", ctime(&curr_start)); */ /* info("stop end %s", ctime(&curr_end)); */ return SLURM_SUCCESS; diff --git a/src/plugins/accounting_storage/mysql/mysql_rollup.h b/src/plugins/accounting_storage/mysql/mysql_rollup.h index d04db2387b1..199e30d802b 100644 --- 
a/src/plugins/accounting_storage/mysql/mysql_rollup.h +++ b/src/plugins/accounting_storage/mysql/mysql_rollup.h @@ -44,6 +44,14 @@ #include "mysql_jobacct_process.h" #ifdef HAVE_MYSQL +extern char *assoc_table; +extern char *assoc_day_table; +extern char *assoc_hour_table; +extern char *assoc_month_table; +extern char *cluster_day_table; +extern char *cluster_hour_table; +extern char *cluster_month_table; +extern char *event_table; extern int mysql_hourly_rollup(mysql_conn_t *mysql_conn, time_t start, time_t end); diff --git a/src/plugins/accounting_storage/none/accounting_storage_none.c b/src/plugins/accounting_storage/none/accounting_storage_none.c index 29deeafbe2e..24de0dd251d 100644 --- a/src/plugins/accounting_storage/none/accounting_storage_none.c +++ b/src/plugins/accounting_storage/none/accounting_storage_none.c @@ -340,3 +340,9 @@ extern int acct_storage_p_update_shares_used(void *db_conn, { return SLURM_SUCCESS; } + +extern int acct_storage_p_flush_jobs_on_cluster( + void *db_conn, char *cluster, time_t event_time) +{ + return SLURM_SUCCESS; +} diff --git a/src/plugins/accounting_storage/pgsql/accounting_storage_pgsql.c b/src/plugins/accounting_storage/pgsql/accounting_storage_pgsql.c index 6b732283a28..0c7c390ab16 100644 --- a/src/plugins/accounting_storage/pgsql/accounting_storage_pgsql.c +++ b/src/plugins/accounting_storage/pgsql/accounting_storage_pgsql.c @@ -1498,5 +1498,16 @@ extern int acct_storage_p_update_shares_used(void *db_conn, { /* This definitely needs to be fleshed out. 
* Go through the list of shares_used_object_t objects and store them */ + return SLURM_SUCCESS; +} + +extern int acct_storage_p_flush_jobs_on_cluster( + void *db_conn, char *cluster, time_t event_time) +{ + /* put end times for a clean start */ + + + + return SLURM_SUCCESS; } diff --git a/src/plugins/accounting_storage/slurmdbd/accounting_storage_slurmdbd.c b/src/plugins/accounting_storage/slurmdbd/accounting_storage_slurmdbd.c index b8dce12a15a..2ba67e2f42d 100644 --- a/src/plugins/accounting_storage/slurmdbd/accounting_storage_slurmdbd.c +++ b/src/plugins/accounting_storage/slurmdbd/accounting_storage_slurmdbd.c @@ -357,8 +357,8 @@ extern List acct_storage_p_modify_clusters(void *db_conn, uint32_t uid, } extern List acct_storage_p_modify_associations(void *db_conn, uint32_t uid, - acct_association_cond_t *assoc_q, - acct_association_rec_t *assoc) + acct_association_cond_t *assoc_q, + acct_association_rec_t *assoc) { slurmdbd_msg_t req; dbd_modify_msg_t get_msg; @@ -1203,3 +1203,23 @@ extern int acct_storage_p_update_shares_used(void *db_conn, return rc; } + +extern int acct_storage_p_flush_jobs_on_cluster(void *db_conn, char *cluster, + time_t event_time) +{ + slurmdbd_msg_t msg; + dbd_cluster_procs_msg_t req; + + info("Ending any jobs in accounting that were running when controller " + "went down on cluster %s", cluster); + req.cluster_name = cluster; + req.proc_count = 0; + req.event_time = event_time; + msg.msg_type = DBD_FLUSH_JOBS; + msg.data = &req; + + if (slurm_send_slurmdbd_msg(&msg) < 0) + return SLURM_ERROR; + + return SLURM_SUCCESS; +} diff --git a/src/sacctmgr/account_functions.c b/src/sacctmgr/account_functions.c index a09912fe1f8..ed0e3d916b9 100644 --- a/src/sacctmgr/account_functions.c +++ b/src/sacctmgr/account_functions.c @@ -142,13 +142,10 @@ static int _set_rec(int *start, int argc, char *argv[], a_set = 1; } else if (strncasecmp (argv[i], "MaxWall", 4) == 0) { mins = time_str2mins(argv[i]+end); - if (mins >= 0) { + if (mins != NO_VAL) { 
assoc->max_wall_duration_per_job = (uint32_t) mins; a_set = 1; - } else if (strcmp(argv[i]+end, "-1") == 0) { - assoc->max_wall_duration_per_job = -1; - a_set = 1; } else { printf(" Bad MaxWall time format: %s\n", argv[i]); @@ -297,12 +294,9 @@ extern int sacctmgr_add_account(int argc, char *argv[]) limit_set = 1; } else if (strncasecmp (argv[i], "MaxWall", 4) == 0) { mins = time_str2mins(argv[i]+end); - if (mins >= 0) { + if (mins != NO_VAL) { max_wall_duration_per_job = (uint32_t) mins; limit_set = 1; - } else if (strcmp(argv[i]+end, "-1") == 0) { - max_wall_duration_per_job = -1; - limit_set = 1; } else { printf(" Bad MaxWall time format: %s\n", argv[i]); @@ -561,16 +555,30 @@ extern int sacctmgr_add_account(int argc, char *argv[]) if(limit_set) { printf(" Settings\n"); - if(fairshare != NO_VAL) + if(fairshare == INFINITE) + printf(" Fairshare = NONE\n"); + else if(fairshare != NO_VAL) printf(" Fairshare = %u\n", fairshare); - if(max_cpu_secs_per_job != NO_VAL) + + if(max_cpu_secs_per_job == INFINITE) + printf(" MaxCPUSecs = NONE\n"); + else if(max_cpu_secs_per_job != NO_VAL) printf(" MaxCPUSecs = %u\n", max_cpu_secs_per_job); - if(max_jobs != NO_VAL) + + if(max_jobs == INFINITE) + printf(" MaxJobs = NONE\n"); + else if(max_jobs != NO_VAL) printf(" MaxJobs = %u\n", max_jobs); - if(max_nodes_per_job != NO_VAL) + + if(max_nodes_per_job == INFINITE) + printf(" MaxNodes = NONE\n"); + else if(max_nodes_per_job != NO_VAL) printf(" MaxNodes = %u\n", max_nodes_per_job); - if(max_wall_duration_per_job != NO_VAL) { + + if(max_wall_duration_per_job == INFINITE) + printf(" MaxWall = NONE\n"); + else if(max_wall_duration_per_job != NO_VAL) { char time_buf[32]; mins2time_str((time_t) max_wall_duration_per_job, time_buf, sizeof(time_buf)); diff --git a/src/sacctmgr/cluster_functions.c b/src/sacctmgr/cluster_functions.c index c53ed62de49..8a099cbce44 100644 --- a/src/sacctmgr/cluster_functions.c +++ b/src/sacctmgr/cluster_functions.c @@ -41,7 +41,7 @@ #include 
"src/sacctmgr/print.h" static int _set_cond(int *start, int argc, char *argv[], - acct_cluster_cond_t *cluster_cond, + List cluster_list, List format_list) { int i; @@ -54,13 +54,13 @@ static int _set_cond(int *start, int argc, char *argv[], i--; break; } else if(!end) { - addto_char_list(cluster_cond->cluster_list, argv[i]); + addto_char_list(cluster_list, argv[i]); set = 1; } else if (strncasecmp (argv[i], "Format", 1) == 0) { if(format_list) addto_char_list(format_list, argv[i]+end); } else if (strncasecmp (argv[i], "Names", 1) == 0) { - addto_char_list(cluster_cond->cluster_list, + addto_char_list(cluster_list, argv[i]+end); set = 1; } else { @@ -74,7 +74,7 @@ static int _set_cond(int *start, int argc, char *argv[], } static int _set_rec(int *start, int argc, char *argv[], - acct_cluster_rec_t *cluster) + acct_association_rec_t *assoc) { int i, mins; int set = 0; @@ -89,34 +89,31 @@ static int _set_rec(int *start, int argc, char *argv[], printf(" Bad format on %s: End your option with " "an '=' sign\n", argv[i]); } else if (strncasecmp (argv[i], "FairShare", 1) == 0) { - if (get_uint(argv[i]+end, &cluster->default_fairshare, + if (get_uint(argv[i]+end, &assoc->fairshare, "FairShare") == SLURM_SUCCESS) set = 1; } else if (strncasecmp (argv[i], "MaxJobs", 4) == 0) { - if (get_uint(argv[i]+end, &cluster->default_max_jobs, + if (get_uint(argv[i]+end, &assoc->max_jobs, "MaxJobs") == SLURM_SUCCESS) set = 1; } else if (strncasecmp (argv[i], "MaxNodes", 4) == 0) { if (get_uint(argv[i]+end, - &cluster->default_max_nodes_per_job, + &assoc->max_nodes_per_job, "MaxNodes") == SLURM_SUCCESS) set = 1; } else if (strncasecmp (argv[i], "MaxWall", 4) == 0) { mins = time_str2mins(argv[i]+end); - if (mins >= 0) { - cluster->default_max_wall_duration_per_job + if (mins != NO_VAL) { + assoc->max_wall_duration_per_job = (uint32_t) mins; set = 1; - } else if (strcmp(argv[i]+end, "-1") == 0) { - cluster->default_max_wall_duration_per_job = -1; - set = 1; } else { printf(" Bad MaxWall 
time format: %s\n", argv[i]); } } else if (strncasecmp (argv[i], "MaxCPUSecs", 4) == 0) { if (get_uint(argv[i]+end, - &cluster->default_max_cpu_secs_per_job, + &assoc->max_cpu_secs_per_job, "MaxCPUSecs") == SLURM_SUCCESS) set = 1; } else { @@ -154,7 +151,7 @@ extern int sacctmgr_add_cluster(int argc, char *argv[]) } else if (strncasecmp (argv[i], "FairShare", 1) == 0) { fairshare = atoi(argv[i]+end); limit_set = 1; - } else if (strncasecmp (argv[i], "MaxCPUSecs4", 4) == 0) { + } else if (strncasecmp (argv[i], "MaxCPUSecs", 4) == 0) { max_cpu_secs_per_job = atoi(argv[i]+end); limit_set = 1; } else if (strncasecmp (argv[i], "MaxJobs=", 4) == 0) { @@ -165,12 +162,9 @@ extern int sacctmgr_add_cluster(int argc, char *argv[]) limit_set = 1; } else if (strncasecmp (argv[i], "MaxWall", 4) == 0) { mins = time_str2mins(argv[i]+end); - if (mins >= 0) { + if (mins != NO_VAL) { max_wall_duration_per_job = (uint32_t) mins; limit_set = 1; - } else if (strcmp(argv[i]+end, "-1") == 0) { - max_wall_duration_per_job = -1; - limit_set = 1; } else { printf(" Bad MaxWall time format: %s\n", argv[i]); @@ -248,16 +242,30 @@ extern int sacctmgr_add_cluster(int argc, char *argv[]) if(limit_set) { printf(" User Defaults\n"); - if(fairshare != NO_VAL) + if(fairshare == INFINITE) + printf(" Fairshare = NONE\n"); + else if(fairshare != NO_VAL) printf(" Fairshare = %u\n", fairshare); - if(max_cpu_secs_per_job != NO_VAL) + + if(max_cpu_secs_per_job == INFINITE) + printf(" MaxCPUSecs = NONE\n"); + else if(max_cpu_secs_per_job != NO_VAL) printf(" MaxCPUSecs = %u\n", max_cpu_secs_per_job); - if(max_jobs != NO_VAL) + + if(max_jobs == INFINITE) + printf(" MaxJobs = NONE\n"); + else if(max_jobs != NO_VAL) printf(" MaxJobs = %u\n", max_jobs); - if(max_nodes_per_job != NO_VAL) + + if(max_nodes_per_job == INFINITE) + printf(" MaxNodes = NONE\n"); + else if(max_nodes_per_job != NO_VAL) printf(" MaxNodes = %u\n", max_nodes_per_job); - if(max_wall_duration_per_job != NO_VAL) { + + 
if(max_wall_duration_per_job == INFINITE) + printf(" MaxWall = NONE\n"); + else if(max_wall_duration_per_job != NO_VAL) { char time_buf[32]; mins2time_str((time_t) max_wall_duration_per_job, time_buf, sizeof(time_buf)); @@ -319,7 +327,7 @@ extern int sacctmgr_list_cluster(int argc, char *argv[]) cluster_cond->cluster_list = list_create(slurm_destroy_char); - _set_cond(&i, argc, argv, cluster_cond, format_list); + _set_cond(&i, argc, argv, cluster_cond->cluster_list, format_list); cluster_list = acct_storage_g_get_clusters(db_conn, cluster_cond); destroy_acct_cluster_cond(cluster_cond); @@ -455,47 +463,55 @@ extern int sacctmgr_modify_cluster(int argc, char *argv[]) { int rc = SLURM_SUCCESS; int i=0; - acct_cluster_rec_t *cluster = xmalloc(sizeof(acct_cluster_rec_t)); - acct_cluster_cond_t *cluster_cond = - xmalloc(sizeof(acct_cluster_cond_t)); - List cluster_list = NULL; - int cond_set = 0, rec_set = 0; + acct_association_rec_t *assoc = xmalloc(sizeof(acct_association_rec_t)); + acct_association_cond_t *assoc_cond = + xmalloc(sizeof(acct_association_cond_t)); + int cond_set = 0, rec_set = 0, set = 0; List ret_list = NULL; - cluster_cond->cluster_list = list_create(slurm_destroy_char); - - cluster->default_fairshare = -2; - cluster->default_max_cpu_secs_per_job = -2; - cluster->default_max_jobs = -2; - cluster->default_max_nodes_per_job = -2; - cluster->default_max_wall_duration_per_job = -2; + assoc_cond = xmalloc(sizeof(acct_association_cond_t)); + assoc_cond->cluster_list = list_create(slurm_destroy_char); + assoc_cond->acct_list = list_create(NULL); + assoc_cond->fairshare = NO_VAL; + assoc_cond->max_cpu_secs_per_job = NO_VAL; + assoc_cond->max_jobs = NO_VAL; + assoc_cond->max_nodes_per_job = NO_VAL; + assoc_cond->max_wall_duration_per_job = NO_VAL; + + assoc->fairshare = NO_VAL; + assoc->max_cpu_secs_per_job = NO_VAL; + assoc->max_jobs = NO_VAL; + assoc->max_nodes_per_job = NO_VAL; + assoc->max_wall_duration_per_job = NO_VAL; for (i=0; i<argc; i++) { if 
(strncasecmp (argv[i], "Where", 5) == 0) { i++; - if(_set_cond(&i, argc, argv, cluster_cond, NULL)) + if(_set_cond(&i, argc, argv, + assoc_cond->cluster_list, NULL)) cond_set = 1; } else if (strncasecmp (argv[i], "Set", 3) == 0) { i++; - if(_set_rec(&i, argc, argv, cluster)) + if(_set_rec(&i, argc, argv, assoc)) rec_set = 1; } else { - if(_set_cond(&i, argc, argv, cluster_cond, NULL)) + if(_set_cond(&i, argc, argv, + assoc_cond->cluster_list, NULL)) cond_set = 1; } } if(!rec_set) { printf(" You didn't give me anything to set\n"); - destroy_acct_cluster_rec(cluster); - destroy_acct_cluster_cond(cluster_cond); + destroy_acct_association_rec(assoc); + destroy_acct_association_cond(assoc_cond); return SLURM_ERROR; } else if(!cond_set) { if(!commit_check("You didn't set any conditions with 'WHERE'.\n" "Are you sure you want to continue?")) { printf("Aborted\n"); - destroy_acct_cluster_rec(cluster); - destroy_acct_cluster_cond(cluster_cond); + destroy_acct_association_rec(assoc); + destroy_acct_association_cond(assoc); return SLURM_SUCCESS; } } @@ -503,45 +519,53 @@ extern int sacctmgr_modify_cluster(int argc, char *argv[]) printf(" Setting\n"); if(rec_set) printf(" User Defaults =\n"); - if(cluster->default_fairshare != NO_VAL) - printf(" Fairshare = %u\n", cluster->default_fairshare); - if(cluster->default_max_cpu_secs_per_job != NO_VAL) + if(assoc->fairshare == INFINITE) + printf(" Fairshare = NONE\n"); + else if(assoc->fairshare != NO_VAL) + printf(" Fairshare = %u\n", assoc->fairshare); + + if(assoc->max_cpu_secs_per_job == INFINITE) + printf(" MaxCPUSecs = NONE\n"); + else if(assoc->max_cpu_secs_per_job != NO_VAL) printf(" MaxCPUSecs = %u\n", - cluster->default_max_cpu_secs_per_job); - if(cluster->default_max_jobs != NO_VAL) - printf(" MaxJobs = %u\n", cluster->default_max_jobs); - if(cluster->default_max_nodes_per_job != NO_VAL) + assoc->max_cpu_secs_per_job); + + if(assoc->max_jobs == INFINITE) + printf(" MaxJobs = NONE\n"); + else if(assoc->max_jobs != NO_VAL) + 
printf(" MaxJobs = %u\n", assoc->max_jobs); + + if(assoc->max_nodes_per_job == INFINITE) + printf(" MaxNodes = NONE\n"); + else if(assoc->max_nodes_per_job != NO_VAL) printf(" MaxNodes = %u\n", - cluster->default_max_nodes_per_job); - if(cluster->default_max_wall_duration_per_job != NO_VAL) { + assoc->max_nodes_per_job); + + if(assoc->max_wall_duration_per_job == INFINITE) + printf(" MaxWall = NONE\n"); + else if(assoc->max_wall_duration_per_job != NO_VAL) { char time_buf[32]; mins2time_str((time_t) - cluster->default_max_wall_duration_per_job, + assoc->max_wall_duration_per_job, time_buf, sizeof(time_buf)); printf(" MaxWall = %s\n", time_buf); } - cluster_list = list_create(destroy_acct_cluster_rec); - list_append(cluster_list, cluster); + list_append(assoc_cond->acct_list, "root"); notice_thread_init(); - ret_list = acct_storage_g_modify_clusters( - db_conn, my_uid, cluster_cond, cluster); - notice_thread_fini(); + ret_list = acct_storage_g_modify_associations( + db_conn, my_uid, assoc_cond, assoc); + if(ret_list && list_count(ret_list)) { char *object = NULL; ListIterator itr = list_iterator_create(ret_list); - printf(" Modifying clusters...\n"); + printf(" Modified cluster defaults for associations...\n"); while((object = list_next(itr))) { printf(" %s\n", object); } list_iterator_destroy(itr); - if(commit_check("Would you like to commit changes?")) { - acct_storage_g_commit(db_conn, 1); - } else { - printf(" Changes Discarded\n"); - acct_storage_g_commit(db_conn, 0); - } + set = 1; } else if(ret_list) { printf(" Nothing modified\n"); } else { @@ -551,9 +575,18 @@ extern int sacctmgr_modify_cluster(int argc, char *argv[]) if(ret_list) list_destroy(ret_list); + notice_thread_fini(); - destroy_acct_cluster_cond(cluster_cond); - destroy_acct_cluster_rec(cluster); + if(set) { + if(commit_check("Would you like to commit changes?")) + acct_storage_g_commit(db_conn, 1); + else { + printf(" Changes Discarded\n"); + acct_storage_g_commit(db_conn, 0); + } + } + 
destroy_acct_association_cond(assoc_cond); + destroy_acct_association_rec(assoc); return rc; } @@ -568,7 +601,7 @@ extern int sacctmgr_delete_cluster(int argc, char *argv[]) cluster_cond->cluster_list = list_create(slurm_destroy_char); - if(!_set_cond(&i, argc, argv, cluster_cond, NULL)) { + if(!_set_cond(&i, argc, argv, cluster_cond->cluster_list, NULL)) { printf(" No conditions given to remove, not executing.\n"); destroy_acct_cluster_cond(cluster_cond); return SLURM_ERROR; diff --git a/src/sacctmgr/sacctmgr.c b/src/sacctmgr/sacctmgr.c index 73f6b23dc31..536081bbec2 100644 --- a/src/sacctmgr/sacctmgr.c +++ b/src/sacctmgr/sacctmgr.c @@ -426,7 +426,7 @@ _process_command (int argc, char *argv[]) argv[0]); } quiet_flag = -1; - } else if (strncasecmp (argv[0], "rollup", 1) == 0) { + } else if (strncasecmp (argv[0], "rollup", 2) == 0) { if (argc > 1) { exit_code = 1; fprintf (stderr, diff --git a/src/sacctmgr/user_functions.c b/src/sacctmgr/user_functions.c index 358e53809aa..5891753a8be 100644 --- a/src/sacctmgr/user_functions.c +++ b/src/sacctmgr/user_functions.c @@ -151,13 +151,10 @@ static int _set_rec(int *start, int argc, char *argv[], a_set = 1; } else if (strncasecmp (argv[i], "MaxWall", 4) == 0) { mins = time_str2mins(argv[i]+end); - if (mins >= 0) { + if (mins != NO_VAL) { association->max_wall_duration_per_job = (uint32_t) mins; a_set = 1; - } else if (strcmp(argv[i]+end, "-1") == 0) { - association->max_wall_duration_per_job = -1; - a_set = 1; } else { printf(" Bad MaxWall time format: %s\n", argv[i]); @@ -319,12 +316,9 @@ extern int sacctmgr_add_user(int argc, char *argv[]) limit_set = 1; } else if (strncasecmp (argv[i], "MaxWall", 4) == 0) { mins = time_str2mins(argv[i]+end); - if (mins >= 0) { + if (mins != NO_VAL) { max_wall_duration_per_job = (uint32_t) mins; limit_set = 1; - } else if (strcmp(argv[i]+end, "-1") == 0) { - max_wall_duration_per_job = -1; - limit_set = 1; } else { printf(" Bad MaxWall time format: %s\n", argv[i]); @@ -606,16 +600,30 @@ 
no_default: if(limit_set) { printf(" Non Default Settings\n"); - if(fairshare != NO_VAL) + if(fairshare == INFINITE) + printf(" Fairshare = NONE\n"); + else if(fairshare != NO_VAL) printf(" Fairshare = %u\n", fairshare); - if(max_cpu_secs_per_job != NO_VAL) + + if(max_cpu_secs_per_job == INFINITE) + printf(" MaxCPUSecs = NONE\n"); + else if(max_cpu_secs_per_job != NO_VAL) printf(" MaxCPUSecs = %u\n", max_cpu_secs_per_job); - if(max_jobs != NO_VAL) + + if(max_jobs == INFINITE) + printf(" MaxJobs = NONE\n"); + else if(max_jobs != NO_VAL) printf(" MaxJobs = %u\n", max_jobs); - if(max_nodes_per_job != NO_VAL) + + if(max_nodes_per_job == INFINITE) + printf(" MaxNodes = NONE\n"); + else if(max_nodes_per_job != NO_VAL) printf(" MaxNodes = %u\n", max_nodes_per_job); - if(max_wall_duration_per_job != NO_VAL) { + + if(max_wall_duration_per_job == INFINITE) + printf(" MaxWall = NONE\n"); + else if(max_wall_duration_per_job != NO_VAL) { char time_buf[32]; mins2time_str((time_t) max_wall_duration_per_job, time_buf, sizeof(time_buf)); diff --git a/src/slurmctld/controller.c b/src/slurmctld/controller.c index cb762c9d836..0e8f00bad41 100644 --- a/src/slurmctld/controller.c +++ b/src/slurmctld/controller.c @@ -377,6 +377,7 @@ int main(int argc, char *argv[]) * information to Gold or SlurmDBD, create * a file called "/tmp/slurm_accounting_first" to * capture node initialization information */ + _accounting_mark_all_nodes_down("cold-start"); unlink("/tmp/slurm_accounting_first"); } @@ -928,6 +929,12 @@ static int _accounting_mark_all_nodes_down(char *reason) } xfree(state_file); + if((rc = acct_storage_g_flush_jobs_on_cluster(acct_db_conn, + slurmctld_cluster_name, + event_time)) + == SLURM_ERROR) + return rc; + node_ptr = node_record_table_ptr; for (i = 0; i < node_record_count; i++, node_ptr++) { if (node_ptr->name == '\0') diff --git a/src/slurmctld/job_mgr.c b/src/slurmctld/job_mgr.c index 85cbb7be008..4174b656787 100644 --- a/src/slurmctld/job_mgr.c +++ 
b/src/slurmctld/job_mgr.c @@ -5296,7 +5296,7 @@ static bool _validate_acct_policy(job_desc_msg_t *job_desc, { uint32_t time_limit; - log_assoc_rec(assoc_ptr); + //log_assoc_rec(assoc_ptr); if ((assoc_ptr->max_wall_duration_per_job != NO_VAL) && (assoc_ptr->max_wall_duration_per_job != INFINITE)) { time_limit = assoc_ptr->max_wall_duration_per_job; diff --git a/src/slurmctld/node_mgr.c b/src/slurmctld/node_mgr.c index a99a99030a1..834ecc65ad6 100644 --- a/src/slurmctld/node_mgr.c +++ b/src/slurmctld/node_mgr.c @@ -1638,7 +1638,10 @@ extern int validate_nodes_via_front_end( slurm_node_registration_status_msg_t *reg_msg) { int error_code = 0, i, jobs_on_node; - bool updated_job = false, failure_logged = false; + bool updated_job = false; +#ifdef HAVE_BG + bool failure_logged = false; +#endif struct job_record *job_ptr; struct config_record *config_ptr; struct node_record *node_ptr; diff --git a/src/slurmdbd/proc_req.c b/src/slurmdbd/proc_req.c index f2b6627b012..945732abb7e 100644 --- a/src/slurmdbd/proc_req.c +++ b/src/slurmdbd/proc_req.c @@ -66,6 +66,8 @@ static int _get_jobs(void *db_conn, Buf in_buffer, Buf *out_buffer); static int _get_usage(uint16_t type, void *db_conn, Buf in_buffer, Buf *out_buffer); static int _get_users(void *db_conn, Buf in_buffer, Buf *out_buffer); +static int _flush_jobs(void *db_conn, + Buf in_buffer, Buf *out_buffer, uint32_t *uid); static void *_init_conn(Buf in_buffer, Buf *out_buffer, uint32_t *uid); static int _fini_conn(void **db_conn, Buf in_buffer, Buf *out_buffer); static int _job_complete(void *db_conn, @@ -177,6 +179,9 @@ proc_req(void **db_conn, slurm_fd orig_fd, case DBD_GET_USERS: rc = _get_users(*db_conn, in_buffer, out_buffer); break; + case DBD_FLUSH_JOBS: + rc = _flush_jobs(*db_conn, in_buffer, out_buffer, uid); + break; case DBD_INIT: if (first) (*db_conn) = _init_conn( @@ -786,6 +791,39 @@ static int _get_users(void *db_conn, Buf in_buffer, Buf *out_buffer) return SLURM_SUCCESS; } +static int _flush_jobs(void 
*db_conn, + Buf in_buffer, Buf *out_buffer, uint32_t *uid) +{ + dbd_cluster_procs_msg_t *cluster_procs_msg = NULL; + int rc = SLURM_SUCCESS; + char *comment = NULL; + + if (*uid != slurmdbd_conf->slurm_user_id) { + comment = "DBD_FLUSH_JOBS message from invalid uid"; + error("DBD_FLUSH_JOBS message from invalid uid %u", *uid); + rc = ESLURM_ACCESS_DENIED; + goto end_it; + } + if (slurmdbd_unpack_cluster_procs_msg(&cluster_procs_msg, in_buffer) != + SLURM_SUCCESS) { + comment = "Failed to unpack DBD_FLUSH_JOBS message"; + error("%s", comment); + rc = SLURM_ERROR; + goto end_it; + } + debug2("DBD_FLUSH_JOBS: called for %s", + cluster_procs_msg->cluster_name); + + rc = acct_storage_g_flush_jobs_on_cluster( + db_conn, + cluster_procs_msg->cluster_name, + cluster_procs_msg->event_time); +end_it: + slurmdbd_free_cluster_procs_msg(cluster_procs_msg); + *out_buffer = make_dbd_rc_msg(rc, comment, DBD_FLUSH_JOBS); + return rc; +} + static void *_init_conn(Buf in_buffer, Buf *out_buffer, uint32_t *uid) { dbd_init_msg_t *init_msg = NULL; @@ -1085,7 +1123,7 @@ static int _modify_assocs(void *db_conn, get_msg->cond, get_msg->rec); slurmdbd_free_modify_msg(DBD_MODIFY_ASSOCS, get_msg); - *out_buffer = init_buf(1024); + *out_buffer = init_buf(1024); pack16((uint16_t) DBD_GOT_LIST, *out_buffer); slurmdbd_pack_list_msg(DBD_GOT_LIST, &list_msg, *out_buffer); if(list_msg.my_list) @@ -1128,7 +1166,7 @@ static int _modify_clusters(void *db_conn, get_msg->cond, get_msg->rec); slurmdbd_free_modify_msg(DBD_MODIFY_CLUSTERS, get_msg); - *out_buffer = init_buf(1024); + *out_buffer = init_buf(1024); pack16((uint16_t) DBD_GOT_LIST, *out_buffer); slurmdbd_pack_list_msg(DBD_GOT_LIST, &list_msg, *out_buffer); if(list_msg.my_list) -- GitLab