diff --git a/NEWS b/NEWS index 2fbd510a2778546efd65d96d57d2ab73b61fd5c2..16985c5f6816cb3d48ca9b731b0baa5695b94c81 100644 --- a/NEWS +++ b/NEWS @@ -31,6 +31,8 @@ documents those changes that are of interest to users and admins. HostFormat=2 in SLURM's wiki.conf for improved performance. -- NOTE: We needed to change an RPC from version 1.3.0. You must upgrade all nodes in a cluster from v1.3.0 to v1.3.1 at the same time. + -- Postgres plugin will work for job accounting, not for + association management yet. * Changes in SLURM 1.3.1 ======================== diff --git a/doc/html/faq.shtml b/doc/html/faq.shtml index 4217efbdc69797905dad3423383f7d9e43a73d93..954e66d24b8d86477d5c723228d9896751a8e5bb 100644 --- a/doc/html/faq.shtml +++ b/doc/html/faq.shtml @@ -78,6 +78,8 @@ errors generated?</a></li> to log job step information at the appropriate level?</li> <li><a href="#rpm">Why isn't the auth_none.so (or other file) in a SLURM RPM?</li> +<li><a href="#slurmdbd">Why should I use the slurmdbd instead of the +regular database plugins?</a></li> </ol> <h2>For Users</h2> @@ -911,6 +913,24 @@ add <i>--with auth_none</i> on the rpmbuild command line or add <i>%_with_auth_none</i> to your ~/rpmmacros file. See the file slurm.spec in the SLURM distribution for a list of other options. +<p><a name="slurmdbd"><b>28. Why should I use the slurmdbd instead of the +regular database plugins?</b></a><br> +While the normal storage plugins will work fine without the added +layer of the slurmdbd, there are some great benefits to using the +slurmdbd. + +1. Added security. Using the slurmdbd you can have an authenticated + connection to the database. +2. Offloading processing from the controller. With the slurmdbd there is no + slowdown to the controller due to a slow or overloaded database. +3. Keeping enterprise-wide accounting from all slurm clusters in one database. + The slurmdbd is multi-threaded and designed to handle all the + accounting for the entire enterprise. +4. 
With the new database plugins 1.3+ you can query with sacct + accounting stats from any node slurm is installed on. With the + slurmdbd you can also query any cluster using the slurmdbd from any + other cluster's nodes. + <p class="footer"><a href="#top">top</a></p> <p style="text-align:center;">Last modified 1 May 2008</p> diff --git a/src/common/assoc_mgr.c b/src/common/assoc_mgr.c index e3a81d69b0e4ea708e073fb644daacaa6daabd21..ae6e12d84ded2d6ec44c662cc730aba4c09e4765 100644 --- a/src/common/assoc_mgr.c +++ b/src/common/assoc_mgr.c @@ -79,6 +79,9 @@ static int _get_local_association_list(void *db_conn, int enforce) list_destroy(assoc_q.cluster_list); if(!local_association_list) { + /* create list so we don't keep calling this if there + isn't anything there */ + local_association_list = list_create(NULL); slurm_mutex_unlock(&local_association_lock); if(enforce) { error("_get_local_association_list: " @@ -167,10 +170,10 @@ extern int assoc_mgr_fill_in_assoc(void *db_conn, acct_association_rec_t *assoc, acct_association_rec_t * found_assoc = NULL; acct_association_rec_t * ret_assoc = NULL; - if(!local_association_list) + if(!local_association_list) { if(_get_local_association_list(db_conn, enforce) == SLURM_ERROR) return SLURM_ERROR; - + } if((!local_association_list || !list_count(local_association_list)) && !enforce) return SLURM_SUCCESS; diff --git a/src/common/jobacct_common.h b/src/common/jobacct_common.h index 56f42cb292dab4c403f895246534e3cce053f93f..e9e7089d53bd630da49c83c2b572e15c439e6f28 100644 --- a/src/common/jobacct_common.h +++ b/src/common/jobacct_common.h @@ -101,6 +101,7 @@ typedef struct { int opt_purge; /* --purge */ int opt_total; /* --total */ int opt_uid; /* --uid (-1=wildcard, 0=root) */ + int opt_uid_set; int opt_verbose; /* --verbose */ long opt_expire; /* --expire= */ char *opt_expire_timespec; /* --expire= */ diff --git a/src/common/slurmdbd_defs.c b/src/common/slurmdbd_defs.c index 
510ce7328442cc0c6ca9fefea8d146ce1f0fa684..7691325414f6602b2c4ea5fac5794d0ad5f0d8db 100644 --- a/src/common/slurmdbd_defs.c +++ b/src/common/slurmdbd_defs.c @@ -1753,6 +1753,7 @@ slurmdbd_pack_job_start_msg(dbd_job_start_msg_t *msg, Buf buffer) pack32(msg->alloc_cpus, buffer); pack32(msg->assoc_id, buffer); packstr(msg->block_id, buffer); + pack32(msg->db_index, buffer); pack_time(msg->eligible_time, buffer); pack32(msg->gid, buffer); pack32(msg->job_id, buffer); @@ -1764,6 +1765,7 @@ slurmdbd_pack_job_start_msg(dbd_job_start_msg_t *msg, Buf buffer) pack32(msg->req_cpus, buffer); pack_time(msg->start_time, buffer); pack_time(msg->submit_time, buffer); + pack32(msg->uid, buffer); } int inline @@ -1775,6 +1777,7 @@ slurmdbd_unpack_job_start_msg(dbd_job_start_msg_t **msg, Buf buffer) safe_unpack32(&msg_ptr->alloc_cpus, buffer); safe_unpack32(&msg_ptr->assoc_id, buffer); safe_unpackstr_xmalloc(&msg_ptr->block_id, &uint32_tmp, buffer); + safe_unpack32(&msg_ptr->db_index, buffer); safe_unpack_time(&msg_ptr->eligible_time, buffer); safe_unpack32(&msg_ptr->gid, buffer); safe_unpack32(&msg_ptr->job_id, buffer); @@ -1786,6 +1789,7 @@ slurmdbd_unpack_job_start_msg(dbd_job_start_msg_t **msg, Buf buffer) safe_unpack32(&msg_ptr->req_cpus, buffer); safe_unpack_time(&msg_ptr->start_time, buffer); safe_unpack_time(&msg_ptr->submit_time, buffer); + safe_unpack32(&msg_ptr->uid, buffer); return SLURM_SUCCESS; unpack_error: diff --git a/src/common/slurmdbd_defs.h b/src/common/slurmdbd_defs.h index 97e9cbd04dd340f6145852a000e9eef775e7a051..d3d7401a1141d209364b79cdc173bd2328f69d59 100644 --- a/src/common/slurmdbd_defs.h +++ b/src/common/slurmdbd_defs.h @@ -188,6 +188,7 @@ typedef struct dbd_job_start_msg { uint32_t alloc_cpus; /* count of allocated processors */ uint32_t assoc_id; /* accounting association id */ char * block_id; /* Bluegene block id */ + uint32_t db_index; /* index into the db for this job */ time_t eligible_time; /* time job becomes eligible to run */ uint32_t gid; /* 
group ID */ uint32_t job_id; /* job ID */ @@ -199,6 +200,7 @@ typedef struct dbd_job_start_msg { uint32_t req_cpus; /* count of req processors */ time_t start_time; /* job start time */ time_t submit_time; /* job submit time */ + uint32_t uid; /* user ID if associations are being used */ } dbd_job_start_msg_t; typedef struct dbd_job_start_rc_msg { diff --git a/src/database/mysql_common.c b/src/database/mysql_common.c index 3d7db2c8bacf07b0525b8bed16be35cf74033396..17cc66c89c101919f5bd4c6aef81c8ac507fb0c7 100644 --- a/src/database/mysql_common.c +++ b/src/database/mysql_common.c @@ -132,8 +132,7 @@ static int _create_db(char *db_name, mysql_db_info_t *db_info) mysql_errno(mysql_db), mysql_error(mysql_db), create_line); } - mysql_close(mysql_db); - mysql_server_end(); + mysql_close_db_connection(&mysql_db); } else { info("Connection failed to host = %s " "user = %s pass = %s port = %u", @@ -201,16 +200,29 @@ extern int mysql_get_db_connection(MYSQL **mysql_db, char *db_name, extern int mysql_close_db_connection(MYSQL **mysql_db) { if(mysql_db && *mysql_db) { + if(mysql_thread_safe()) + mysql_thread_end(); mysql_close(*mysql_db); - /* leave as server instead of library since this is - * backwards compatible */ - mysql_server_end(); *mysql_db = NULL; } return SLURM_SUCCESS; } +extern int mysql_cleanup() +{ + debug3("starting mysql cleaning up"); + +#ifdef mysql_library_end + mysql_library_end(); +#else + mysql_server_end(); +#endif + + debug3("finished mysql cleaning up"); + return SLURM_SUCCESS; +} + extern int mysql_db_query(MYSQL *mysql_db, char *query) { if(!mysql_db) diff --git a/src/database/mysql_common.h b/src/database/mysql_common.h index 8e5abf894cc2b665bdd86e354d3fd27cba6b3371..18b7787785696c61c66f80838f7861be65bbad5f 100644 --- a/src/database/mysql_common.h +++ b/src/database/mysql_common.h @@ -79,6 +79,7 @@ extern int *destroy_mysql_db_info(mysql_db_info_t *db_info); extern int mysql_get_db_connection(MYSQL **mysql_db, char *db_name, mysql_db_info_t 
*db_info); extern int mysql_close_db_connection(MYSQL **mysql_db); +extern int mysql_cleanup(); extern int mysql_db_query(MYSQL *mysql_db, char *query); extern int mysql_db_commit(MYSQL *mysql_db); extern int mysql_db_rollback(MYSQL *mysql_db); diff --git a/src/database/pgsql_common.c b/src/database/pgsql_common.c index 31af50e1caf1d72b48adf8c7027f76bb8d1f957f..1d07c1a92c7e8df1592c935da5dfb40f0ad2c1fc 100644 --- a/src/database/pgsql_common.c +++ b/src/database/pgsql_common.c @@ -56,10 +56,10 @@ extern int *destroy_pgsql_db_info(pgsql_db_info_t *db_info) return SLURM_SUCCESS; } -extern int pgsql_create_db(PGconn *pgsql_db, char *db_name, - pgsql_db_info_t *db_info) +extern int _create_db(char *db_name, pgsql_db_info_t *db_info) { char create_line[50]; + PGconn *pgsql_db = NULL; char *connect_line = xstrdup_printf("dbname = 'postgres'" " host = '%s'" " port = '%u'" @@ -82,12 +82,14 @@ extern int pgsql_create_db(PGconn *pgsql_db, char *db_name, PQresultStatus(result), PQerrorMessage(pgsql_db), create_line); } PQclear(result); + pgsql_close_db_connection(&pgsql_db); } else { info("Connection failed to %s", connect_line); fatal("Status was: %d %s", PQstatus(pgsql_db), PQerrorMessage(pgsql_db)); } xfree(connect_line); + return SLURM_SUCCESS; } @@ -120,15 +122,10 @@ extern int pgsql_get_db_connection(PGconn **pgsql_db, char *db_name, } info("Database %s not created. 
Creating", db_name); - PQfinish(*pgsql_db); - pgsql_create_db(*pgsql_db, db_name, db_info); + pgsql_close_db_connection(pgsql_db); + _create_db(db_name, db_info); } else { storage_init = true; - /* debug2("connected to %s", db_name); */ -/* if(rollback || rollback_started) { */ -/* rollback_started = 1; */ -/* PQexec(*pgsql_db, "BEGIN WORK"); */ -/* } */ } } xfree(connect_line); diff --git a/src/database/pgsql_common.h b/src/database/pgsql_common.h index b53fbd2fdb4f49f505491e36ad30d338f7216689..cc48e8c5fad7acbfa3ee19063278007d118ecae9 100644 --- a/src/database/pgsql_common.h +++ b/src/database/pgsql_common.h @@ -75,9 +75,6 @@ extern pthread_mutex_t pgsql_lock; extern int *destroy_pgsql_db_info(pgsql_db_info_t *db_info); -extern int pgsql_create_db(PGconn *pgsql_db, char *db_name, - pgsql_db_info_t *db_info); - extern int pgsql_get_db_connection(PGconn **pgsql_db, char *db_name, pgsql_db_info_t *db_info); diff --git a/src/plugins/accounting_storage/mysql/accounting_storage_mysql.c b/src/plugins/accounting_storage/mysql/accounting_storage_mysql.c index ac6f793126f6a1982dc77611919c69c530138b1d..9c6a565cbdfe107fa8d984ce5f5cf1fa571fd549 100644 --- a/src/plugins/accounting_storage/mysql/accounting_storage_mysql.c +++ b/src/plugins/accounting_storage/mysql/accounting_storage_mysql.c @@ -749,6 +749,7 @@ static int _mysql_acct_check_tables(MYSQL *acct_mysql_db) { "id", "int not null auto_increment" }, { "jobid", "mediumint unsigned not null" }, { "associd", "mediumint unsigned not null" }, + { "uid", "smallint unsigned not null" }, { "gid", "smallint unsigned not null" }, { "partition", "tinytext not null" }, { "blockid", "tinytext" }, @@ -985,7 +986,7 @@ extern int fini ( void ) #ifdef HAVE_MYSQL destroy_mysql_db_info(mysql_db_info); xfree(mysql_db_name); - + mysql_cleanup(); return SLURM_SUCCESS; #else return SLURM_ERROR; @@ -1024,7 +1025,6 @@ extern int acct_storage_p_close_connection(mysql_conn_t **mysql_conn) return SLURM_SUCCESS; acct_storage_p_commit((*mysql_conn), 
0); - mysql_close_db_connection(&(*mysql_conn)->acct_mysql_db); list_destroy((*mysql_conn)->update_list); xfree((*mysql_conn)); @@ -1704,8 +1704,7 @@ extern int acct_storage_p_add_associations(mysql_conn_t *mysql_conn, xstrcat(tmp_char, ", "); xstrcat(tmp_char, massoc_req_inx[i]); } - - + xstrfmtcat(query, "select distinct %s from %s %s FOR UPDATE;", tmp_char, assoc_table, update); @@ -4139,38 +4138,60 @@ extern int jobacct_storage_p_job_start(mysql_conn_t *mysql_conn, job_ptr->requid = -1; /* force to -1 for sacct to know this * hasn't been set yet */ - query = xstrdup_printf( - "insert into %s " - "(jobid, account, associd, gid, partition, blockid, " - "eligible, submit, start, name, track_steps, " - "state, priority, req_cpus, alloc_cpus, nodelist) " - "values (%u, '%s', %u, %u, '%s', '%s', " - "%d, %d, %d, '%s', %u, " - "%u, %u, %u, %u, '%s') " - "on duplicate key update id=LAST_INSERT_ID(id)", - job_table, job_ptr->job_id, job_ptr->account, job_ptr->assoc_id, - job_ptr->group_id, job_ptr->partition, block_id, - (int)job_ptr->details->begin_time, - (int)job_ptr->details->submit_time, (int)job_ptr->start_time, - jname, track_steps, job_ptr->job_state & (~JOB_COMPLETING), - priority, job_ptr->num_procs, job_ptr->total_procs, nodes); + if(!job_ptr->db_index) { + query = xstrdup_printf( + "insert into %s " + "(jobid, account, associd, uid, gid, partition, " + "blockid, eligible, submit, start, name, track_steps, " + "state, priority, req_cpus, alloc_cpus, nodelist) " + "values (%u, '%s', %u, %u, %u, '%s', '%s', " + "%d, %d, %d, '%s', %u, " + "%u, %u, %u, %u, '%s')", + job_table, job_ptr->job_id, job_ptr->account, + job_ptr->assoc_id, + job_ptr->user_id, job_ptr->group_id, + job_ptr->partition, block_id, + (int)job_ptr->details->begin_time, + (int)job_ptr->details->submit_time, + (int)job_ptr->start_time, + jname, track_steps, + job_ptr->job_state & (~JOB_COMPLETING), + priority, job_ptr->num_procs, + job_ptr->total_procs, nodes); + + try_again: + 
if(!(job_ptr->db_index = mysql_insert_ret_id( + mysql_conn->acct_mysql_db, query))) { + if(!reinit) { + error("It looks like the storage has gone " + "away trying to reconnect"); + mysql_close_db_connection( + &mysql_conn->acct_mysql_db); + mysql_get_db_connection( + &mysql_conn->acct_mysql_db, + mysql_db_name, mysql_db_info); + reinit = 1; + goto try_again; + } else + rc = SLURM_ERROR; + } + } else { + query = xstrdup_printf( + "update %s set partition='%s', blockid='%s', start=%d, " + "name='%s', state=%u, alloc_cpus=%u, nodelist='%s', " + "account='%s' where id=%d", + job_table, job_ptr->partition, block_id, + (int)job_ptr->start_time, + jname, + job_ptr->job_state & (~JOB_COMPLETING), + job_ptr->total_procs, nodes, + job_ptr->account, job_ptr->db_index); + rc = mysql_db_query(mysql_conn->acct_mysql_db, query); + } xfree(block_id); xfree(jname); -try_again: - if(!(job_ptr->db_index = mysql_insert_ret_id(mysql_conn->acct_mysql_db, query))) { - if(!reinit) { - error("It looks like the storage has gone " - "away trying to reconnect"); - mysql_close_db_connection(&mysql_conn->acct_mysql_db); - mysql_get_db_connection(&mysql_conn->acct_mysql_db, - mysql_db_name, mysql_db_info); - reinit = 1; - goto try_again; - } else - rc = SLURM_ERROR; - } xfree(query); return rc; diff --git a/src/plugins/accounting_storage/mysql/mysql_jobacct_process.c b/src/plugins/accounting_storage/mysql/mysql_jobacct_process.c index 8fdb2627ccb44ddba3c3498fa65f2b929eb66dfd..8895e89adb3176374eacef584dedd434071ac726 100644 --- a/src/plugins/accounting_storage/mysql/mysql_jobacct_process.c +++ b/src/plugins/accounting_storage/mysql/mysql_jobacct_process.c @@ -78,9 +78,11 @@ extern List mysql_jobacct_process_get_jobs(mysql_conn_t *mysql_conn, "t1.id", "t1.jobid", "t1.associd", + "t1.uid", "t1.gid", "t1.partition", "t1.blockid", + "t1.account", "t1.eligible", "t1.submit", "t1.start", @@ -137,9 +139,11 @@ extern List mysql_jobacct_process_get_jobs(mysql_conn_t *mysql_conn, JOB_REQ_ID, JOB_REQ_JOBID, 
JOB_REQ_ASSOCID, + JOB_REQ_UID, JOB_REQ_GID, JOB_REQ_PARTITION, JOB_REQ_BLOCKID, + JOB_REQ_ACCOUNT, JOB_REQ_ELIGIBLE, JOB_REQ_SUBMIT, JOB_REQ_START, @@ -276,8 +280,13 @@ extern List mysql_jobacct_process_get_jobs(mysql_conn_t *mysql_conn, if(account_rec.user) job->user = xstrdup(account_rec.user); + else + job->uid = atoi(row[JOB_REQ_UID]); if(account_rec.acct) job->account = xstrdup(account_rec.acct); + else + job->account = xstrdup(row[JOB_REQ_ACCOUNT]); + job->blockid = xstrdup(row[JOB_REQ_BLOCKID]); job->eligible = atoi(row[JOB_REQ_ELIGIBLE]); diff --git a/src/plugins/accounting_storage/mysql/mysql_jobacct_process.h b/src/plugins/accounting_storage/mysql/mysql_jobacct_process.h index 69a0eae140d16cd5de196716a25ceae7f11faaa3..e9def5417a691a45d6ddf564e04a12befb16e008 100644 --- a/src/plugins/accounting_storage/mysql/mysql_jobacct_process.h +++ b/src/plugins/accounting_storage/mysql/mysql_jobacct_process.h @@ -69,9 +69,6 @@ typedef struct { extern char *job_table; extern char *step_table; -extern int acct_storage_p_get_assoc_id(mysql_conn_t *mysql_conn, - acct_association_rec_t *assoc); - extern List mysql_jobacct_process_get_jobs(mysql_conn_t *mysql_conn, List selected_steps, List selected_parts, diff --git a/src/plugins/accounting_storage/pgsql/accounting_storage_pgsql.c b/src/plugins/accounting_storage/pgsql/accounting_storage_pgsql.c index 607a343ff5442731a9798818ca0f7eafee886cf7..49ce12d62af6aca4c89d55a86b5c83a455304d33 100644 --- a/src/plugins/accounting_storage/pgsql/accounting_storage_pgsql.c +++ b/src/plugins/accounting_storage/pgsql/accounting_storage_pgsql.c @@ -136,6 +136,8 @@ static pgsql_db_info_t *_pgsql_acct_create_db_info() if(!db_info->port) db_info->port = 5432; db_info->host = slurm_get_accounting_storage_host(); + if(!db_info->host) + db_info->host = xstrdup("localhost"); db_info->user = slurm_get_accounting_storage_user(); db_info->pass = slurm_get_accounting_storage_pass(); return db_info; @@ -198,7 +200,7 @@ static int 
_pgsql_acct_check_tables(PGconn *acct_pgsql_db, { "mod_time", "bigint default 0" }, { "deleted", "smallint default 0" }, { "name", "text not null" }, - { "control_host", "tinytext not null" }, + { "control_host", "text not null" }, { "control_port", "int not null" }, { NULL, NULL} }; @@ -221,7 +223,7 @@ static int _pgsql_acct_check_tables(PGconn *acct_pgsql_db, { "node_name", "text default '' not null" }, { "cluster", "text not null" }, { "cpu_count", "int not null" }, - { "period_start", "bigint unsigned not null" }, + { "period_start", "bigint not null" }, { "period_end", "bigint default 0 not null" }, { "reason", "text not null" }, { NULL, NULL} @@ -231,9 +233,11 @@ static int _pgsql_acct_check_tables(PGconn *acct_pgsql_db, { "id", "serial" }, { "jobid ", "integer not null" }, { "associd", "bigint not null" }, - { "gid", "smallint unsigned not null" }, + { "uid", "smallint not null" }, + { "gid", "smallint not null" }, { "partition", "text not null" }, { "blockid", "text" }, + { "account", "text" }, { "submit", "bigint not null" }, { "eligible", "bigint default 0 not null" }, { "start", "bigint default 0 not null" }, @@ -256,7 +260,7 @@ static int _pgsql_acct_check_tables(PGconn *acct_pgsql_db, { "id", "int not null" }, { "stepid", "smallint not null" }, { "start", "bigint default 0 not null" }, - { "end", "bigint default 0 not null" }, + { "endtime", "bigint default 0 not null" }, { "suspended", "bigint default 0 not null" }, { "name", "text not null" }, { "nodelist", "text not null" }, @@ -300,7 +304,7 @@ static int _pgsql_acct_check_tables(PGconn *acct_pgsql_db, storage_field_t user_table_fields[] = { { "creation_time", "bigint not null" }, { "mod_time", "bigint default 0" }, - { "deleted", "bool default 0" }, + { "deleted", "smallint default 0" }, { "name", "text not null" }, { "default_acct", "text not null" }, { "qos", "smallint default 1 not null" }, @@ -380,8 +384,7 @@ static int _pgsql_acct_check_tables(PGconn *acct_pgsql_db, 
if(pgsql_db_create_table(acct_pgsql_db, acct_coord_table, acct_coord_table_fields, - ", primary key (acct(20), " - "user_name(20)))") + ", unique (acct, user_name))") == SLURM_ERROR) return SLURM_ERROR; } else { @@ -394,7 +397,7 @@ static int _pgsql_acct_check_tables(PGconn *acct_pgsql_db, if(!acct_found) { if(pgsql_db_create_table(acct_pgsql_db, acct_table, acct_table_fields, - ", primary key (name(20)))") + ", unique (name))") == SLURM_ERROR) return SLURM_ERROR; } else { @@ -409,7 +412,7 @@ static int _pgsql_acct_check_tables(PGconn *acct_pgsql_db, acct_pgsql_db, assoc_day_table, assoc_usage_table_fields, - ", primary key (associd, period_start))") + ", unique (associd, period_start))") == SLURM_ERROR) return SLURM_ERROR; } else { @@ -424,7 +427,7 @@ static int _pgsql_acct_check_tables(PGconn *acct_pgsql_db, acct_pgsql_db, assoc_hour_table, assoc_usage_table_fields, - ", primary key (associd, period_start))") + ", unique (associd, period_start))") == SLURM_ERROR) return SLURM_ERROR; } else { @@ -439,7 +442,7 @@ static int _pgsql_acct_check_tables(PGconn *acct_pgsql_db, acct_pgsql_db, assoc_month_table, assoc_usage_table_fields, - ", primary key (associd, period_start))") + ", unique (associd, period_start))") == SLURM_ERROR) return SLURM_ERROR; } else { @@ -453,9 +456,7 @@ static int _pgsql_acct_check_tables(PGconn *acct_pgsql_db, if(pgsql_db_create_table( acct_pgsql_db, assoc_table, assoc_table_fields, - ", primary key (id), " - "unique index (user_name(20), acct(20), " - "cluster(20), partition(20)))") + ", unique (user_name, acct, cluster, partition))") == SLURM_ERROR) return SLURM_ERROR; } else { @@ -470,7 +471,7 @@ static int _pgsql_acct_check_tables(PGconn *acct_pgsql_db, acct_pgsql_db, cluster_day_table, cluster_usage_table_fields, - ", primary key (cluster(20), period_start))") + ", unique (cluster, period_start))") == SLURM_ERROR) return SLURM_ERROR; } else { @@ -485,7 +486,7 @@ static int _pgsql_acct_check_tables(PGconn *acct_pgsql_db, acct_pgsql_db, 
cluster_hour_table, cluster_usage_table_fields, - ", primary key (cluster(20), period_start))") + ", unique (cluster, period_start))") == SLURM_ERROR) return SLURM_ERROR; } else { @@ -500,7 +501,7 @@ static int _pgsql_acct_check_tables(PGconn *acct_pgsql_db, acct_pgsql_db, cluster_month_table, cluster_usage_table_fields, - ", primary key (cluster(20), period_start))") + ", unique (cluster, period_start))") == SLURM_ERROR) return SLURM_ERROR; } else { @@ -513,7 +514,7 @@ static int _pgsql_acct_check_tables(PGconn *acct_pgsql_db, if(!cluster_found) { if(pgsql_db_create_table(acct_pgsql_db, cluster_table, cluster_table_fields, - ", primary key (name(20)))") + ", unique (name))") == SLURM_ERROR) return SLURM_ERROR; } else { @@ -526,8 +527,8 @@ static int _pgsql_acct_check_tables(PGconn *acct_pgsql_db, if(!event_found) { if(pgsql_db_create_table(acct_pgsql_db, event_table, event_table_fields, - ", primary key (node_name(20), " - "cluster(20), period_start))") + ", unique (node_name, " + "cluster, period_start))") == SLURM_ERROR) return SLURM_ERROR; } else { @@ -540,8 +541,7 @@ static int _pgsql_acct_check_tables(PGconn *acct_pgsql_db, if(!job_found) { if(pgsql_db_create_table(acct_pgsql_db, job_table, job_table_fields, - ", primary key (id), unique index " - "(jobid, associd, submit))") + ", unique (jobid, associd, submit))") == SLURM_ERROR) return SLURM_ERROR; } else { @@ -554,7 +554,7 @@ static int _pgsql_acct_check_tables(PGconn *acct_pgsql_db, if(!step_found) { if(pgsql_db_create_table(acct_pgsql_db, step_table, step_table_fields, - ", primary key (id, stepid))") + ", unique (id, stepid))") == SLURM_ERROR) return SLURM_ERROR; @@ -568,7 +568,7 @@ static int _pgsql_acct_check_tables(PGconn *acct_pgsql_db, if(!txn_found) { if(pgsql_db_create_table(acct_pgsql_db, txn_table, txn_table_fields, - ", primary key (id))") + ", unique (id))") == SLURM_ERROR) return SLURM_ERROR; } else { @@ -581,7 +581,7 @@ static int _pgsql_acct_check_tables(PGconn *acct_pgsql_db, 
if(!user_found) { if(pgsql_db_create_table(acct_pgsql_db, user_table, user_table_fields, - ", primary key (name(20)))") + ", unique (name))") == SLURM_ERROR) return SLURM_ERROR; } else { @@ -646,7 +646,7 @@ extern int init ( void ) pgsql_get_db_connection(&acct_pgsql_db, pgsql_db_name, pgsql_db_info); rc = _pgsql_acct_check_tables(acct_pgsql_db, pgsql_db_info->user); - + pgsql_close_db_connection(&acct_pgsql_db); #endif /* since this can be loaded from many different places only tell us once. */ @@ -690,7 +690,8 @@ extern void *acct_storage_p_get_connection(bool make_agent, bool rollback) extern int acct_storage_p_close_connection(PGconn **acct_pgsql_db) { #ifdef HAVE_PGSQL - pgsql_close_db_connection(acct_pgsql_db); + if(acct_pgsql_db && *acct_pgsql_db) + pgsql_close_db_connection(acct_pgsql_db); return SLURM_SUCCESS; #else @@ -915,7 +916,7 @@ extern int clusteracct_storage_p_cluster_procs(PGconn *acct_pgsql_db, #ifdef HAVE_PGSQL static uint32_t last_procs = -1; char* query; - int rc = SLURM_ERROR; + int rc = SLURM_SUCCESS; PGresult *result = NULL; int got_procs = 0; @@ -944,7 +945,7 @@ extern int clusteracct_storage_p_cluster_procs(PGconn *acct_pgsql_db, /* we only are checking the first one here */ if(!PQntuples(result)) { - debug("We don't have an entry for this machine %s" + debug("We don't have an entry for this machine %s " "most likely a first time running.", cluster); goto add_it; } @@ -1014,7 +1015,7 @@ extern int jobacct_storage_p_job_start(PGconn *acct_pgsql_db, return SLURM_ERROR; } - debug2("pgsql_jobacct_job_start() called"); + debug3("pgsql_jobacct_job_start() called"); priority = (job_ptr->priority == NO_VAL) ? 
-1L : (long) job_ptr->priority; @@ -1050,39 +1051,59 @@ extern int jobacct_storage_p_job_start(PGconn *acct_pgsql_db, } job_ptr->requid = -1; /* force to -1 for sacct to know this * hasn't been set yet */ - query = xstrdup_printf( - "insert into %s " - "(jobid, associd, gid, partition, blockid, " - "eligible, submit, start, name, track_steps, " - "state, priority, req_cpus, alloc_cpus, nodelist) " - "values (%u, %u, %u, '%s', '%s', " - "%d, %d, %d, '%s', %u, " - "%u, %u, %u, %u, '%s')", - job_table, job_ptr->job_id, job_ptr->assoc_id, - job_ptr->group_id, job_ptr->partition, block_id, - (int)job_ptr->details->begin_time, - (int)job_ptr->details->submit_time, (int)job_ptr->start_time, - jname, track_steps, job_ptr->job_state & (~JOB_COMPLETING), - priority, job_ptr->num_procs, job_ptr->total_procs, nodes); + if(!job_ptr->db_index) { + query = xstrdup_printf( + "insert into %s " + "(jobid, account, associd, uid, gid, partition, " + "blockid, eligible, submit, start, name, track_steps, " + "state, priority, req_cpus, alloc_cpus, nodelist) " + "values (%u, '%s', %u, %u, %u, '%s', '%s', " + "%d, %d, %d, '%s', %u, " + "%u, %u, %u, %u, '%s')", + job_table, job_ptr->job_id, job_ptr->account, + job_ptr->assoc_id, + job_ptr->user_id, job_ptr->group_id, + job_ptr->partition, block_id, + (int)job_ptr->details->begin_time, + (int)job_ptr->details->submit_time, + (int)job_ptr->start_time, + jname, track_steps, + job_ptr->job_state & (~JOB_COMPLETING), + priority, job_ptr->num_procs, + job_ptr->total_procs, nodes); + try_again: + if(!(job_ptr->db_index = pgsql_insert_ret_id(acct_pgsql_db, + "job_table_id_seq", + query))) { + if(!reinit) { + error("It looks like the storage has gone " + "away trying to reconnect"); + pgsql_close_db_connection(&acct_pgsql_db); + pgsql_get_db_connection(&acct_pgsql_db, + pgsql_db_name, + pgsql_db_info); + reinit = 1; + goto try_again; + } else + rc = SLURM_ERROR; + } + } else { + query = xstrdup_printf( + "update %s set partition='%s', blockid='%s', 
start=%d, " + "name='%s', state=%u, alloc_cpus=%u, nodelist='%s', " + "account='%s' where id=%d", + job_table, job_ptr->partition, block_id, + (int)job_ptr->start_time, + jname, + job_ptr->job_state & (~JOB_COMPLETING), + job_ptr->total_procs, nodes, + job_ptr->account, job_ptr->db_index); + rc = pgsql_db_query(acct_pgsql_db, query); + } xfree(block_id); xfree(jname); -try_again: - if(!(job_ptr->db_index = pgsql_insert_ret_id(acct_pgsql_db, - "index_table_id_seq", - query))) { - if(!reinit) { - error("It looks like the storage has gone " - "away trying to reconnect"); - pgsql_close_db_connection(&acct_pgsql_db); - pgsql_get_db_connection(&acct_pgsql_db, - pgsql_db_name, pgsql_db_info); - reinit = 1; - goto try_again; - } else - rc = SLURM_ERROR; - } xfree(query); return rc; @@ -1114,7 +1135,7 @@ extern int jobacct_storage_p_job_complete(PGconn *acct_pgsql_db, return SLURM_ERROR; } - debug2("pgsql_jobacct_job_complete() called"); + debug3("pgsql_jobacct_job_complete() called"); if (job_ptr->end_time == 0) { debug("pgsql_jobacct: job %u never started", job_ptr->job_id); return SLURM_ERROR; @@ -1133,7 +1154,7 @@ extern int jobacct_storage_p_job_complete(PGconn *acct_pgsql_db, if(job_ptr->db_index == -1) return SLURM_ERROR; } - query = xstrdup_printf("update %s set start=%u, end=%u, state=%d, " + query = xstrdup_printf("update %s set start=%u, endtime=%u, state=%d, " "nodelist='%s', comp_code=%u, " "kill_requid=%u where id=%u", job_table, (int)job_ptr->start_time, @@ -1418,7 +1439,7 @@ extern int jobacct_storage_p_suspend(PGconn *acct_pgsql_db, if(rc != SLURM_ERROR) { snprintf(query, sizeof(query), "update %s set suspended=%u-suspended, " - "state=%d where id=%u and end=0", + "state=%d where id=%u and endtime=0", step_table, (int)job_ptr->suspend_time, job_ptr->job_state, job_ptr->db_index); rc = pgsql_db_query(acct_pgsql_db, query); @@ -1451,7 +1472,7 @@ extern List jobacct_storage_p_get_jobs(PGconn *acct_pgsql_db, job_list = 
pgsql_jobacct_process_get_jobs(acct_pgsql_db, selected_steps, selected_parts, - params); + params); #endif return job_list; } diff --git a/src/plugins/accounting_storage/pgsql/pgsql_jobacct_process.c b/src/plugins/accounting_storage/pgsql/pgsql_jobacct_process.c index 2936bec2b53eefd17240d4a88336c85765f35ff9..fedf95216930b826929f49f5533d1c1a87f12c74 100644 --- a/src/plugins/accounting_storage/pgsql/pgsql_jobacct_process.c +++ b/src/plugins/accounting_storage/pgsql/pgsql_jobacct_process.c @@ -76,13 +76,15 @@ extern List pgsql_jobacct_process_get_jobs(PGconn *acct_pgsql_db, "t1.id", "t1.jobid", "t1.associd", + "t1.uid", "t1.gid", "t1.partition", "t1.blockid", + "t1.account", "t1.eligible", "t1.submit", "t1.start", - "t1.end", + "t1.endtime", "t1.suspended", "t1.name", "t1.track_steps", @@ -101,7 +103,7 @@ extern List pgsql_jobacct_process_get_jobs(PGconn *acct_pgsql_db, char *step_req_inx[] = { "t1.stepid", "t1.start", - "t1.end", + "t1.endtime", "t1.suspended", "t1.name", "t1.nodelist", @@ -135,13 +137,15 @@ extern List pgsql_jobacct_process_get_jobs(PGconn *acct_pgsql_db, JOB_REQ_ID, JOB_REQ_JOBID, JOB_REQ_ASSOCID, + JOB_REQ_UID, JOB_REQ_GID, JOB_REQ_PARTITION, JOB_REQ_BLOCKID, + JOB_REQ_ACCOUNT, JOB_REQ_ELIGIBLE, JOB_REQ_SUBMIT, JOB_REQ_START, - JOB_REQ_END, + JOB_REQ_ENDTIME, JOB_REQ_SUSPENDED, JOB_REQ_NAME, JOB_REQ_TRACKSTEPS, @@ -158,7 +162,7 @@ extern List pgsql_jobacct_process_get_jobs(PGconn *acct_pgsql_db, enum { STEP_REQ_STEPID, STEP_REQ_START, - STEP_REQ_END, + STEP_REQ_ENDTIME, STEP_REQ_SUSPENDED, STEP_REQ_NAME, STEP_REQ_NODELIST, @@ -256,19 +260,31 @@ extern List pgsql_jobacct_process_get_jobs(PGconn *acct_pgsql_db, JOB_REQ_ALLOC_CPUS)); job->associd = atoi(PQgetvalue(result, i, JOB_REQ_ASSOCID)); account_rec.id = job->associd; - acct_storage_p_get_assoc_id(acct_pgsql_db, &account_rec); + assoc_mgr_fill_in_assoc(acct_pgsql_db, &account_rec, 0); + if(account_rec.cluster) { + if(params->opt_cluster && + strcmp(params->opt_cluster, account_rec.cluster)) { 
+ destroy_jobacct_job_rec(job); + job = NULL; + continue; + } + job->cluster = xstrdup(account_rec.cluster); + } if(account_rec.user) job->user = xstrdup(account_rec.user); + else + job->uid = atoi(PQgetvalue(result, i, JOB_REQ_UID)); if(account_rec.acct) job->account = xstrdup(account_rec.acct); - if(account_rec.cluster) - job->cluster = xstrdup(account_rec.cluster); + else + job->account = xstrdup(PQgetvalue(result, i, + JOB_REQ_ACCOUNT)); job->blockid = xstrdup(PQgetvalue(result, i, JOB_REQ_BLOCKID)); job->eligible = atoi(PQgetvalue(result, i, JOB_REQ_SUBMIT)); job->submit = atoi(PQgetvalue(result, i, JOB_REQ_SUBMIT)); job->start = atoi(PQgetvalue(result, i, JOB_REQ_START)); - job->end = atoi(PQgetvalue(result, i, JOB_REQ_END)); + job->end = atoi(PQgetvalue(result, i, JOB_REQ_ENDTIME)); job->suspended = atoi(PQgetvalue(result, i, JOB_REQ_SUSPENDED)); if(!job->end) { job->elapsed = now - job->start; @@ -368,7 +384,7 @@ extern List pgsql_jobacct_process_get_jobs(PGconn *acct_pgsql_db, step->start = atoi( PQgetvalue(step_result, j, JOB_REQ_START)); step->end = atoi( - PQgetvalue(step_result, j, STEP_REQ_END)); + PQgetvalue(step_result, j, STEP_REQ_ENDTIME)); /* figure this out by start stop */ step->suspended = atoi( PQgetvalue(step_result, j, STEP_REQ_SUSPENDED)); diff --git a/src/plugins/accounting_storage/pgsql/pgsql_jobacct_process.h b/src/plugins/accounting_storage/pgsql/pgsql_jobacct_process.h index 997c81d40e29f13cd631b0ce30f6ee6edec4e466..255f1a9d78a35770f0e8d18634212f3a4a389e98 100644 --- a/src/plugins/accounting_storage/pgsql/pgsql_jobacct_process.h +++ b/src/plugins/accounting_storage/pgsql/pgsql_jobacct_process.h @@ -46,6 +46,7 @@ #include <sys/types.h> #include <pwd.h> #include <stdlib.h> +#include "src/common/assoc_mgr.h" #include "src/common/jobacct_common.h" #include "src/slurmdbd/read_config.h" #include "src/slurmctld/slurmctld.h" @@ -57,9 +58,6 @@ extern char *job_table; extern char *step_table; -extern int acct_storage_p_get_assoc_id(PGconn 
*acct_pgsql_db, - acct_association_rec_t *assoc); - extern List pgsql_jobacct_process_get_jobs(PGconn *acct_pgsql_db, List selected_steps, List selected_parts, diff --git a/src/plugins/accounting_storage/slurmdbd/accounting_storage_slurmdbd.c b/src/plugins/accounting_storage/slurmdbd/accounting_storage_slurmdbd.c index 565c1f120af0b37c7db41bd8837746b6371ce89e..b2dc9ad3d054695b228ab6bcb0f7086a22cd46af 100644 --- a/src/plugins/accounting_storage/slurmdbd/accounting_storage_slurmdbd.c +++ b/src/plugins/accounting_storage/slurmdbd/accounting_storage_slurmdbd.c @@ -889,6 +889,7 @@ extern int jobacct_storage_p_job_start(void *db_conn, #endif req.block_id = block_id; xfree(block_id); + req.db_index = job_ptr->db_index; if (job_ptr->details) req.eligible_time = job_ptr->details->begin_time; req.gid = job_ptr->group_id; @@ -902,6 +903,7 @@ extern int jobacct_storage_p_job_start(void *db_conn, req.start_time = job_ptr->start_time; if (job_ptr->details) req.submit_time = job_ptr->details->submit_time; + req.uid = job_ptr->user_id; msg.msg_type = DBD_JOB_START; msg.data = &req; diff --git a/src/sacct/options.c b/src/sacct/options.c index 0fffef99ecdf5f6e50ae9aa107e924b61bb3c9fb..330b6254884f992ab2e7481879900a90934ba5c6 100644 --- a/src/sacct/options.c +++ b/src/sacct/options.c @@ -202,7 +202,7 @@ void _help_msg(void) " intermediate steps\n" "-u <uid>, --uid <uid>\n" " Select only jobs submitted by the user with uid <uid>. 
Only\n" - " root users are allowed to specify a uid other than their own.\n" + " root users are allowed to specify a uid other than their own -1 for all users.\n" "--usage\n" " Pointer to this message.\n" "-v, --verbose\n" @@ -235,6 +235,7 @@ void _init_params() params.opt_purge = 0; /* --purge */ params.opt_total = 0; /* --total */ params.opt_uid = -1; /* --uid (-1=wildcard, 0=root) */ + params.opt_uid_set = 0; params.opt_verbose = 0; /* --verbose */ params.opt_expire_timespec = NULL; /* --expire= */ params.opt_field_list = NULL; /* --fields= */ @@ -292,6 +293,12 @@ int get_data(void) itr = list_iterator_create(jobs); while((job = list_next(itr))) { + if(job->user) { + struct passwd *pw = NULL; + if ((pw=getpwnam(job->user))) + job->uid = pw->pw_uid; + } + if(!list_count(job->steps)) continue; @@ -367,7 +374,8 @@ void parse_command_line(int argc, char **argv) _init_params(); - if ((i=getuid())) /* default to current user unless root*/ + if ((i=getuid())) + /* default to current user unless root*/ params.opt_uid = i; opterr = 1; /* Let getopt report problems to the user */ @@ -547,7 +555,7 @@ void parse_command_line(int argc, char **argv) break; case 'u': - if (isdigit((int) *optarg)) + if (isdigit((int) *optarg) || atoi(optarg) == -1) params.opt_uid = atoi(optarg); else { struct passwd *pwd; diff --git a/src/sacct/print.c b/src/sacct/print.c index 2bd91eedaecd8f8697e7418a68830334095ff0d0..61a70f8e4f8ca7284923fc84ef62475803c8c533 100644 --- a/src/sacct/print.c +++ b/src/sacct/print.c @@ -850,7 +850,7 @@ void print_uid(type_t type, void *object) jobacct_job_rec_t *job = (jobacct_job_rec_t *)object; jobcomp_job_rec_t *jobcomp = (jobcomp_job_rec_t *)object; int32_t uid = -1; - struct passwd *passwd_ptr = NULL; + struct passwd *pw = NULL; switch(type) { case HEADLINE: @@ -861,9 +861,8 @@ void print_uid(type_t type, void *object) break; case JOB: if(job->user) { - getpwnam(job->user); - if(passwd_ptr) - uid = passwd_ptr->pw_uid; + if ((pw=getpwnam(job->user))) + uid = 
pw->pw_uid; } else uid = job->uid; break; diff --git a/src/sacctmgr/sacctmgr.c b/src/sacctmgr/sacctmgr.c index 88d3eb229e1443cd40bd6940cadcad68d5915174..2b6f47a17962912bf8bfc51a30cb13703bc4863f 100644 --- a/src/sacctmgr/sacctmgr.c +++ b/src/sacctmgr/sacctmgr.c @@ -196,6 +196,7 @@ main (int argc, char *argv[]) } acct_storage_g_close_connection(&db_conn); + slurm_acct_storage_fini(); printf("\n"); exit(exit_code); } diff --git a/src/slurmctld/controller.c b/src/slurmctld/controller.c index 93ec25feca858dfd49a97c8f69562b8022a73bac..79be14fabb9dfbe22bb1355624af5beb33caf981 100644 --- a/src/slurmctld/controller.c +++ b/src/slurmctld/controller.c @@ -459,7 +459,7 @@ int main(int argc, char *argv[]) if (unlink(slurmctld_conf.slurmctld_pidfile) < 0) verbose("Unable to remove pidfile '%s': %m", slurmctld_conf.slurmctld_pidfile); - + #ifdef MEMORY_LEAK_DEBUG /* This should purge all allocated memory, *\ \* Anything left over represents a leak. */ @@ -492,6 +492,7 @@ int main(int argc, char *argv[]) checkpoint_fini(); slurm_auth_fini(); switch_fini(); + assoc_mgr_fini(); /* purge remaining data structures */ slurm_cred_ctx_destroy(slurmctld_config.cred_ctx); @@ -510,13 +511,14 @@ int main(int argc, char *argv[]) sleep(1); } #endif + xfree(slurmctld_cluster_name); if (cnt) { info("Slurmctld shutdown completing with %d active agent " "threads\n\n", cnt); } log_fini(); - + if (dump_core) abort(); else diff --git a/src/slurmdbd/proc_req.c b/src/slurmdbd/proc_req.c index 1d9dcf79a19fbb4edfe33110a50896a174ea0cb5..37026bfdc3f721b9c84bb5621711c507e7eed827 100644 --- a/src/slurmdbd/proc_req.c +++ b/src/slurmdbd/proc_req.c @@ -926,7 +926,9 @@ static int _job_start(void *db_conn, job.total_procs = job_start_msg->alloc_cpus; job.assoc_id = job_start_msg->assoc_id; job.comment = job_start_msg->block_id; + job.db_index = job_start_msg->db_index; details.begin_time = job_start_msg->eligible_time; + job.user_id = job_start_msg->uid; job.group_id = job_start_msg->gid; job.job_id = 
job_start_msg->job_id; job.job_state = job_start_msg->job_state; @@ -940,9 +942,14 @@ static int _job_start(void *db_conn, job.details = &details; - debug2("DBD_JOB_START: ID:%u NAME:%s", - job_start_msg->job_id, job_start_msg->name); - + if(job.db_index) { + debug2("DBD_JOB_START: START CALL ID:%u NAME:%s INX:%u", + job_start_msg->job_id, job_start_msg->name, + job.db_index); + } else { + debug2("DBD_JOB_START: ELIGIBLE CALL ID:%u NAME:%s", + job_start_msg->job_id, job_start_msg->name); + } job_start_rc_msg.return_code = jobacct_storage_g_job_start(db_conn, &job); job_start_rc_msg.db_index = job.db_index; @@ -1291,9 +1298,11 @@ static int _register_ctld(void *db_conn, slurm_fd orig_fd, if(!list_msg.my_list || !list_count(list_msg.my_list)) { comment = "This cluster hasn't been added to accounting yet"; rc = SLURM_ERROR; - } else { + } + + if(list_msg.my_list) list_destroy(list_msg.my_list); - } + list_destroy(cluster_q.cluster_list); /* diff --git a/src/slurmdbd/rpc_mgr.c b/src/slurmdbd/rpc_mgr.c index 61490ad60e232ab3d1c0aaa7ca3e63ef64f2f0fe..e58f503cc4a8bed20eec2f2b75a86ff9ffb75123 100644 --- a/src/slurmdbd/rpc_mgr.c +++ b/src/slurmdbd/rpc_mgr.c @@ -238,7 +238,6 @@ static void * _service_connection(void *arg) } acct_storage_g_close_connection(&db_conn); - if (slurm_close_accepted_conn(conn->newsockfd) < 0) error("close(%d): %m", conn->newsockfd); else