diff --git a/doc/man/man1/sacctmgr.1 b/doc/man/man1/sacctmgr.1 index 446bed2c7f639eb39f3d7481de2ab5828683c769..cb6e5eb087deb9b14d2b03cb426552982e39f289 100644 --- a/doc/man/man1/sacctmgr.1 +++ b/doc/man/man1/sacctmgr.1 @@ -877,7 +877,7 @@ Maximum number of jobs this user can run. \fIMaxNodesPerJob=\fP Maximum number of nodes per job this user can run. .TP -\fIMaxProcSecondsPerJob= +\fIMaxProcSecondsPerJob=\fP Maximum cpu seconds this user can run per job. .TP \fIMaxWallDurationPerJob=\fP @@ -887,6 +887,64 @@ Maximum time (not related to job size) this user can run. Comma separated list of Quality of Service names (Defined in sacctmgr). .RE +.SH "ARCHIVE FUNCTIONALITY" +Sacctmgr has the capability to archive to a flatfile and or load that +data if needed later. The archiving is usually done by the slurmdbd +and it is highly recommended you only do it through sacctmgr if you +completely understand what you are doing. For slurmdbd options see +"man slurmdbd" for more information. +Loading data into the database can be done from these files to either +view old data or regenerate rolled up data. + +These are the options for both dump and load of archive information. + +archive dump + +.TP +\fIDirectory=\fP +Directory to store the archive data. +.TP +\fIEvents\fP +Archive Events. If not specified and PurgeEventMonths is set +all event data removed will be lost permanently. +.TP +\fIJobs\fP +Archive Jobs. If not specified and PurgeJobMonths is set +all job data removed will be lost permanently. +.TP +\fIPurgeEventMonths=\fP +Purge cluster event records older than time stated in months. +.TP +\fIPurgeJobMonths=\fP +Purge job records older than time stated in months. +.TP +\fIPurgeStepMonths=\fP +Purge step records older than time stated in months. +.TP +\fIPurgeSuspendMonths=\fP +Purge job suspend records older than time stated in months. +.TP +\fIScript=\fP +Run this script instead of the generic form of archive to flat files. +.TP +\fISteps\fP +Archive Steps. 
If not specified and PurgeStepMonths is set +all step data removed will be lost permanently. +.TP +\fISuspend\fP +Archive Suspend Data. If not specified and PurgeSuspendMonths is set +all suspend data removed will be lost permanently. +.RE + +archive load +.TP +\fIFile=\fP +File to load into database. +.TP +\fIInsert=\fP +SQL to insert directly into the database. This should be used very +cautiously since this is writing your sql into the database. +.RE .SH "EXAMPLES" .eo diff --git a/doc/man/man5/slurmdbd.conf.5 b/doc/man/man5/slurmdbd.conf.5 index 29ae872f8f5f019cc829d4c147fbe485694d48f7..ec35744becfe5eb1132cb4bf9cdd4732392afb24 100644 --- a/doc/man/man5/slurmdbd.conf.5 +++ b/doc/man/man5/slurmdbd.conf.5 @@ -27,6 +27,10 @@ If ArchiveScript is not set the slurmdbd will generate a text file that can be read in anytime with sacctmgr load filename. This directory is where the file will be placed archive has ran. Default is /tmp. +.TP +\fBArchiveEvents\fR +Boolean, yes to archive event data, no other wise. Default is no. + .TP \fBArchiveJobs\fR Boolean, yes to archive job data, no other wise. Default is no. @@ -38,23 +42,40 @@ records out of the database into an archive. The script is executed with a no arguments, The following environment variables are set. .RS .TP -\fBSLURM_ARCHIVE_STEPS\fR -1 for archive steps 0 otherwise. +\fBSLURM_ARCHIVE_EVENTS\fR +1 for archive events 0 otherwise. .TP -\fBSLURM_ARCHIVE_LAST_STEP\fR -Time of last step start to archive. +\fBSLURM_ARCHIVE_LAST_EVENT\fR +Time of last event start to archive. .TP \fBSLURM_ARCHIVE_JOBS\fR 1 for achive jobs 0 otherwise. .TP \fBSLURM_ARCHIVE_LAST_JOB\fR Time of last job submit to archive. +.TP +\fBSLURM_ARCHIVE_STEPS\fR +1 for archive steps 0 otherwise. +.TP +\fBSLURM_ARCHIVE_LAST_STEP\fR +Time of last step start to archive. +.TP +\fBSLURM_ARCHIVE_SUSPEND\fR +1 for archive suspend data 0 otherwise. +.TP +\fBSLURM_ARCHIVE_LAST_SUSPEND\fR +Time of last suspend start to archive. 
+.TP .RE .TP \fBArchiveSteps\fR Boolean, yes to archive step data, no other wise. Default is no. +.TP +\fBArchiveSuspend\fR +Boolean, yes to archive suspend data, no other wise. Default is no. + .TP \fBAuthInfo\fR Additional information to be used for authentication of communications @@ -121,13 +142,6 @@ The default value is 3. When adding a new cluster this will be used as the qos for the cluster unless something is explicitly set by the admin with the create. -.TP -\fBJobPurge\fR -Individual job records over this age are purged from the database. -Aggregated information will be preserved indefinitely. -The time is a numeric value and is a number of months. -If zero (default), then job records are never purged. - .TP \fBLogFile\fR Fully qualified pathname of a file into which the Slurm Database Daemon's @@ -188,26 +202,54 @@ but can only see themselves when listing users. .RE .TP -\fBSlurmUser\fR -The name of the user that the \fBslurmctld\fR daemon executes as. -This user must exist on the machine executing the Slurm Database Daemon -and have the same user ID as the hosts on which \fBslurmctld\fR execute. -For security purposes, a user other than "root" is recommended. -The default value is "root". +\fBPurgeEventMonths\fR +Events happening on the cluster over this age are purged from the database. +This includes node down times and such. +The time is a numeric value and is a number of months. +If zero (default), then event records are never purged. + +.TP +\fBPurgeJobMonths\fR +Individual job records over this age are purged from the database. +Aggregated information will be preserved indefinitely. +The time is a numeric value and is a number of months. +If zero (default), then job records are never purged. .TP -\fBStepPurge\fR +\fBPurgeStepMonths\fR Individual job step records over this age are purged from the database. Aggregated information will be preserved indefinitely. The time is a numeric value and is a number of months. 
If zero (default), then job step records are never purged. +.TP +\fBPurgeSuspendMonths\fR +Records of individual suspend times for jobs over this age are purged from the +database. +Aggregated information will be preserved indefinitely. +The time is a numeric value and is a number of months. +If zero (default), then suspend records are never purged. + +.TP +\fBSlurmUser\fR +The name of the user that the \fBslurmctld\fR daemon executes as. +This user must exist on the machine executing the Slurm Database Daemon +and have the same user ID as the hosts on which \fBslurmctld\fR execute. +For security purposes, a user other than "root" is recommended. +The default value is "root". + .TP \fBStorageHost\fR Define the name of the host the database is running where we are going to store the data. Ideally this should be the host on which slurmdbd executes. +.TP +\fBStorageBackupHost\fR +Define the name of the backup host the database is running where we are going +to store the data. +Default is none. + .TP \fBStorageLoc\fR Specify the name of the database as the location where accounting @@ -262,10 +304,14 @@ Characterization Key. Must be set to track wckey usage. 
.br # .br +ArchiveEvents=yes +.br ArchiveJobs=yes .br ArchiveSteps=no .br +ArchiveSuspend=no +.br #ArchiveScript=/usr/sbin/slurm.dbd.archive .br AuthInfo=/var/run/munge/munge.socket.2 @@ -276,9 +322,13 @@ DbdHost=db_host .br DebugLevel=4 .br -JobPurge=12 +PurgeEventMonths=1 +.br +PurgeJobMonths=12 +.br +PurgeStepMonths=1 .br -StepPurge=1 +PurgeSuspendMonths=1 .br LogFile=/var/log/slurmdbd.log .br diff --git a/src/common/slurm_accounting_storage.c b/src/common/slurm_accounting_storage.c index c31aab541eaae5ca05b786aa2d463faca1cf395d..d185de5c7aea0c394ebb99c410b6924c86e80435 100644 --- a/src/common/slurm_accounting_storage.c +++ b/src/common/slurm_accounting_storage.c @@ -6763,21 +6763,29 @@ extern void pack_acct_archive_cond(void *in, uint16_t rpc_version, Buf buffer) if(!object) { packnull(buffer); pack16((uint16_t)NO_VAL, buffer); + pack16((uint16_t)NO_VAL, buffer); packnull(buffer); pack16((uint16_t)NO_VAL, buffer); + pack16((uint16_t)NO_VAL, buffer); pack_acct_job_cond(NULL, rpc_version, buffer); pack16((uint16_t)NO_VAL, buffer); pack16((uint16_t)NO_VAL, buffer); + pack16((uint16_t)NO_VAL, buffer); + pack16((uint16_t)NO_VAL, buffer); return; } packstr(object->archive_dir, buffer); + pack16(object->archive_events, buffer); pack16(object->archive_jobs, buffer); packstr(object->archive_script, buffer); pack16(object->archive_steps, buffer); + pack16(object->archive_suspend, buffer); pack_acct_job_cond(object->job_cond, rpc_version, buffer); - pack16(object->job_purge, buffer); - pack16(object->step_purge, buffer); + pack16(object->purge_event, buffer); + pack16(object->purge_job, buffer); + pack16(object->purge_step, buffer); + pack16(object->purge_suspend, buffer); } extern int unpack_acct_archive_cond(void **object, uint16_t rpc_version, @@ -6790,15 +6798,19 @@ extern int unpack_acct_archive_cond(void **object, uint16_t rpc_version, *object = object_ptr; safe_unpackstr_xmalloc(&object_ptr->archive_dir, &uint32_tmp, buffer); + 
safe_unpack16(&object_ptr->archive_events, buffer); safe_unpack16(&object_ptr->archive_jobs, buffer); safe_unpackstr_xmalloc(&object_ptr->archive_script, &uint32_tmp, buffer); safe_unpack16(&object_ptr->archive_steps, buffer); + safe_unpack16(&object_ptr->archive_suspend, buffer); if(unpack_acct_job_cond((void *)&object_ptr->job_cond, rpc_version, buffer) == SLURM_ERROR) goto unpack_error; - safe_unpack16(&object_ptr->job_purge, buffer); - safe_unpack16(&object_ptr->step_purge, buffer); + safe_unpack16(&object_ptr->purge_event, buffer); + safe_unpack16(&object_ptr->purge_job, buffer); + safe_unpack16(&object_ptr->purge_step, buffer); + safe_unpack16(&object_ptr->purge_suspend, buffer); return SLURM_SUCCESS; diff --git a/src/common/slurm_accounting_storage.h b/src/common/slurm_accounting_storage.h index 09a49134921280b7c98535db03e2425c6bf52955..621e5b34699e730c7d5a4df9c9fb06aec07aaa16 100644 --- a/src/common/slurm_accounting_storage.h +++ b/src/common/slurm_accounting_storage.h @@ -512,6 +512,9 @@ typedef struct { typedef struct { char *archive_dir; /* location to place archive file */ + uint16_t archive_events; /* whether or not to keep an archive + file of events that can be loaded + later */ uint16_t archive_jobs; /* whether or not to keep an archive file of jobs that can be loaded later */ @@ -520,9 +523,15 @@ typedef struct { uint16_t archive_steps; /* whether or not to keep an archive file of steps that can be loaded later */ + uint16_t archive_suspend; /* whether or not to keep an archive + file of suspend data that can be loaded + later */ acct_job_cond_t *job_cond; /* conditions for the jobs to archive */ - uint16_t job_purge; /* purge jobs older than this in months */ - uint16_t step_purge; /* purge steps older than this in months */ + uint16_t purge_event; /* purge events older than this in months */ + uint16_t purge_job; /* purge jobs older than this in months */ + uint16_t purge_step; /* purge steps older than this in months */ + uint16_t purge_suspend; 
/* purge suspend data older than this + * in months */ } acct_archive_cond_t; typedef struct { diff --git a/src/plugins/accounting_storage/mysql/accounting_storage_mysql.c b/src/plugins/accounting_storage/mysql/accounting_storage_mysql.c index 8fc6f60379a40af7f94767db7beae17a43673033..be2b3db1313f0ad4a07cfaad6bd4e5ca2f958356 100644 --- a/src/plugins/accounting_storage/mysql/accounting_storage_mysql.c +++ b/src/plugins/accounting_storage/mysql/accounting_storage_mysql.c @@ -3982,10 +3982,9 @@ extern int acct_storage_p_add_clusters(mysql_conn_t *mysql_conn, uint32_t uid, continue; } - xstrcat(cols, "creation_time, mod_time, acct, " - "cluster, classification"); + xstrcat(cols, "creation_time, mod_time, acct, cluster"); xstrfmtcat(vals, "%d, %d, 'root', \"%s\"", - now, now, object->name, object->classification); + now, now, object->name); xstrfmtcat(extra, ", mod_time=%d", now); if(object->root_assoc) _setup_association_limits(object->root_assoc, &cols, diff --git a/src/plugins/accounting_storage/mysql/mysql_jobacct_process.c b/src/plugins/accounting_storage/mysql/mysql_jobacct_process.c index a9334ec82d135335c05c6f33a7c2edd0de586e7c..b864235a6dd77f5fcb18050c8a17e1df4ea5c98b 100644 --- a/src/plugins/accounting_storage/mysql/mysql_jobacct_process.c +++ b/src/plugins/accounting_storage/mysql/mysql_jobacct_process.c @@ -86,6 +86,158 @@ static int _write_to_file(int fd, char *data) return rc; } +/* + * _write_archive_file - write every row of result into the file + * <arch_dir>/<arch_type>_archive_<start>_<end>.sql as one multi-row + * insert statement. + * IN result - rows to archive; an empty result writes nothing + * IN start_col - column holding the record start time, the first row's + * value rounded down to the start of its month names the file + * IN col_count - number of columns of each row to write + * IN curr_end - time used for the end stamp in the file name + * IN arch_dir - directory the archive file is created in + * IN arch_type - record type, used as the file name prefix + * IN insert - statement prefix written before "values", owned and + * freed by the caller + * IN with_deleted - add a trailing '1' value to each row and finish with + * "deleted=1" instead of "period_end=VALUES(period_end)" + * RET SLURM_SUCCESS, or the error from creating or writing the file + * NOTE(review): the period_end trailer is also what the "suspend" + * caller gets, although its column list has no period_end - confirm + * that such a file loads. + */ +static int _write_archive_file(MYSQL_RES *result, int start_col, int col_count, + time_t curr_end, char *arch_dir, + char *arch_type, char *insert, + bool with_deleted) +{ + time_t period_start = 0; + int fd = -1; /* stays < 0 until the .new file is created */ + int rc = SLURM_SUCCESS; + MYSQL_ROW row; + struct tm time_tm; + char *old_file = NULL, *new_file = NULL, *reg_file = NULL; + char *values = NULL; + char start_char[32]; + char end_char[32]; + int i=0; + + xassert(result); + + //START_TIMER; + slurm_mutex_lock(&local_file_lock); + while((row = mysql_fetch_row(result))) { + if(fd >= 0) { /* file already open, not the first row */ + xstrcat(values, ",\n("); + } else { 
+ period_start = atoi(row[start_col]); + localtime_r(&period_start, &time_tm); + time_tm.tm_sec = 0; + time_tm.tm_min = 0; + time_tm.tm_hour = 0; + time_tm.tm_mday = 1; + time_tm.tm_isdst = -1; + period_start = mktime(&time_tm); + localtime_r(&period_start, &time_tm); + snprintf(start_char, sizeof(start_char), + "%4.4u-%2.2u-%2.2u" + "T%2.2u:%2.2u:%2.2u", + (time_tm.tm_year + 1900), + (time_tm.tm_mon+1), + time_tm.tm_mday, + time_tm.tm_hour, + time_tm.tm_min, + time_tm.tm_sec); + + localtime_r(&curr_end, &time_tm); + snprintf(end_char, sizeof(end_char), + "%4.4u-%2.2u-%2.2u" + "T%2.2u:%2.2u:%2.2u", + (time_tm.tm_year + 1900), + (time_tm.tm_mon+1), + time_tm.tm_mday, + time_tm.tm_hour, + time_tm.tm_min, + time_tm.tm_sec); + + /* write the buffer to file */ + reg_file = xstrdup_printf( + "%s/%s_archive_%s_%s.sql", + arch_dir, arch_type, + start_char, end_char); + debug("Storing %s archive at %s", arch_type, reg_file); + old_file = xstrdup_printf("%s.old", reg_file); + new_file = xstrdup_printf("%s.new", reg_file); + + fd = creat(new_file, 0600); + if (fd < 0) { + /* creat() returns -1 on failure; save errno before logging */ + rc = errno; + error("Can't save archive, " + "create file %s error %m", + new_file); + break; + } + values = xstrdup_printf("%s\nvalues\n(", insert); + } + + xstrfmtcat(values, "'%s'", row[0]); + for(i=1; i<col_count; i++) { + xstrfmtcat(values, ", '%s'", row[i]); + } + + if(with_deleted) + xstrcat(values, ", '1')"); + else + xstrcat(values, ")"); + + if((rc = _write_to_file(fd, values)) + != SLURM_SUCCESS) { + xfree(values); + break; + } + xfree(values); + } + + if(fd < 0) /* no row fetched or creat() failed */ + goto end_it; + + /* keep an earlier write error instead of overwriting it */ + if(rc == SLURM_SUCCESS) { + if(with_deleted) + rc = _write_to_file(fd, + " on duplicate key update " + "deleted=1;"); + else + rc = _write_to_file(fd, + " on duplicate key update " + "period_end=VALUES(period_end);"); + } +// END_TIMER2("write file"); +// info("write file took %s", TIME_STR); + + fsync(fd); + close(fd); + + if (rc) + (void) unlink(new_file); + else { /* file shuffle */ + int ign; /* avoid warning */ + (void) 
unlink(old_file); + ign = link(reg_file, old_file); + (void) unlink(reg_file); + ign = link(new_file, reg_file); + (void) unlink(new_file); + } +end_it: + xfree(old_file); + xfree(reg_file); + xfree(new_file); + slurm_mutex_unlock(&local_file_lock); + + return rc; +} + static int _archive_script(acct_archive_cond_t *arch_cond, time_t last_submit) { char * args[] = {arch_cond->archive_script, NULL}; @@ -124,30 +276,31 @@ static int _archive_script(acct_archive_cond_t *arch_cond, time_t last_submit) env = env_array_create(); - if(arch_cond->step_purge) { + if(arch_cond->purge_event) { /* use localtime to avoid any daylight savings issues */ if(!localtime_r(&last_submit, &time_tm)) { - error("Couldn't get localtime from first step start %d", + error("Couldn't get localtime from " + "first event start %d", last_submit); return SLURM_ERROR; } - time_tm.tm_mon -= arch_cond->step_purge; + time_tm.tm_mon -= arch_cond->purge_event; time_tm.tm_isdst = -1; curr_end = mktime(&time_tm); - env_array_append_fmt(&env, "SLURM_ARCHIVE_STEPS", "%u", - arch_cond->archive_steps); - env_array_append_fmt(&env, "SLURM_ARCHIVE_LAST_STEP", "%d", + env_array_append_fmt(&env, "SLURM_ARCHIVE_EVENTS", "%u", + arch_cond->archive_events); + env_array_append_fmt(&env, "SLURM_ARCHIVE_LAST_EVENT", "%d", curr_end); } - if(arch_cond->job_purge) { + if(arch_cond->purge_job) { /* use localtime to avoid any daylight savings issues */ if(!localtime_r(&last_submit, &time_tm)) { error("Couldn't get localtime from first start %d", last_submit); return SLURM_ERROR; } - time_tm.tm_mon -= arch_cond->job_purge; + time_tm.tm_mon -= arch_cond->purge_job; time_tm.tm_isdst = -1; curr_end = mktime(&time_tm); @@ -157,6 +310,39 @@ static int _archive_script(acct_archive_cond_t *arch_cond, time_t last_submit) curr_end); } + if(arch_cond->purge_step) { + /* use localtime to avoid any daylight savings issues */ + if(!localtime_r(&last_submit, &time_tm)) { + error("Couldn't get localtime from first step start %d", + last_submit); + 
return SLURM_ERROR; + } + time_tm.tm_mon -= arch_cond->purge_step; + time_tm.tm_isdst = -1; + curr_end = mktime(&time_tm); + env_array_append_fmt(&env, "SLURM_ARCHIVE_STEPS", "%u", + arch_cond->archive_steps); + env_array_append_fmt(&env, "SLURM_ARCHIVE_LAST_STEP", "%d", + curr_end); + } + + if(arch_cond->purge_suspend) { + /* use localtime to avoid any daylight savings issues */ + if(!localtime_r(&last_submit, &time_tm)) { + error("Couldn't get localtime from first " + "suspend start %d", + last_submit); + return SLURM_ERROR; + } + time_tm.tm_mon -= arch_cond->purge_suspend; + time_tm.tm_isdst = -1; + curr_end = mktime(&time_tm); + env_array_append_fmt(&env, "SLURM_ARCHIVE_SUSPEND", "%u", + arch_cond->archive_suspend); + env_array_append_fmt(&env, "SLURM_ARCHIVE_LAST_SUSPEND", "%d", + curr_end); + } + #ifdef _PATH_STDPATH env_array_append (&env, "PATH", _PATH_STDPATH); #else @@ -1237,18 +1395,29 @@ extern List mysql_jobacct_process_get_jobs(mysql_conn_t *mysql_conn, uid_t uid, extern int mysql_jobacct_process_archive(mysql_conn_t *mysql_conn, acct_archive_cond_t *arch_cond) { - int rc = SLURM_SUCCESS, fd = 0; + int rc = SLURM_SUCCESS; char *query = NULL; time_t last_submit = time(NULL); time_t curr_end; char *tmp = NULL; int i=0; - char *old_file = NULL, *new_file = NULL, *reg_file = NULL; struct tm time_tm; - char start_char[32]; - char end_char[32]; + // DEF_TIMERS; + /* if this changes you will need to edit the corresponding + * enum below */ + char *event_req_inx[] = { + "node_name", + "cluster", + "cpu_count", + "state", + "period_start", + "period_end", + "reason", + "cluster_nodes", + }; + /* if this changes you will need to edit the corresponding * enum below */ char *job_req_inx[] = { @@ -1276,7 +1445,9 @@ extern int mysql_jobacct_process_archive(mysql_conn_t *mysql_conn, "priority", "req_cpus", "alloc_cpus", + "alloc_nodes", "nodelist", + "node_inx", "kill_requid", "qos" }; @@ -1291,11 +1462,14 @@ extern int mysql_jobacct_process_archive(mysql_conn_t 
*mysql_conn, "suspended", "name", "nodelist", + "node_inx", "state", "kill_requid", "comp_code", + "nodes", "cpus", "tasks", + "task_dist", "user_sec", "user_usec", "sys_sec", @@ -1318,6 +1492,28 @@ extern int mysql_jobacct_process_archive(mysql_conn_t *mysql_conn, "ave_cpu" }; + + /* if this changes you will need to edit the corresponding + * enum below */ + char *suspend_req_inx[] = { + "id", + "associd", + "start", + "end", + }; + + enum { + EVENT_REQ_NODE, + EVENT_REQ_CLUSTER, + EVENT_REQ_CPUS, + EVENT_REQ_STATE, + EVENT_REQ_START, + EVENT_REQ_END, + EVENT_REQ_REASON, + EVENT_REQ_NODES, + EVENT_REQ_COUNT + }; + enum { JOB_REQ_ID, JOB_REQ_JOBID, @@ -1343,11 +1539,14 @@ extern int mysql_jobacct_process_archive(mysql_conn_t *mysql_conn, JOB_REQ_PRIORITY, JOB_REQ_REQ_CPUS, JOB_REQ_ALLOC_CPUS, + JOB_REQ_ALLOC_NODES, JOB_REQ_NODELIST, + JOB_REQ_NODE_INX, JOB_REQ_KILL_REQUID, JOB_REQ_QOS, JOB_REQ_COUNT }; + enum { STEP_REQ_ID, STEP_REQ_STEPID, @@ -1356,11 +1555,14 @@ extern int mysql_jobacct_process_archive(mysql_conn_t *mysql_conn, STEP_REQ_SUSPENDED, STEP_REQ_NAME, STEP_REQ_NODELIST, + STEP_REQ_NODE_INX, STEP_REQ_STATE, STEP_REQ_KILL_REQUID, STEP_REQ_COMP_CODE, + STEP_REQ_NODES, STEP_REQ_CPUS, STEP_REQ_TASKS, + STEP_REQ_TASKDIST, STEP_REQ_USER_SEC, STEP_REQ_USER_USEC, STEP_REQ_SYS_SEC, @@ -1384,6 +1586,14 @@ extern int mysql_jobacct_process_archive(mysql_conn_t *mysql_conn, STEP_REQ_COUNT }; + enum { + SUSPEND_REQ_ID, + SUSPEND_REQ_ASSOCID, + SUSPEND_REQ_START, + SUSPEND_REQ_END, + SUSPEND_REQ_COUNT + }; + if(!arch_cond) { error("No arch_cond was given to archive from. returning"); return SLURM_ERROR; @@ -1410,9 +1620,176 @@ extern int mysql_jobacct_process_archive(mysql_conn_t *mysql_conn, return SLURM_ERROR; } - if(arch_cond->step_purge) { + if(arch_cond->purge_event) { /* remove all data from step table that was older than - * start * arch_cond->step_purge. + * period_start * arch_cond->purge_event. 
+ */ + /* use localtime to avoid any daylight savings issues */ + if(!localtime_r(&last_submit, &time_tm)) { + error("Couldn't get localtime from first submit %d", + last_submit); + return SLURM_ERROR; + } + time_tm.tm_mday = 1; + time_tm.tm_mon -= arch_cond->purge_event; + time_tm.tm_isdst = -1; + curr_end = mktime(&time_tm); + + debug4("from %d - %d months purging events from before %d", + last_submit, arch_cond->purge_event, curr_end); + + if(arch_cond->archive_events) { + char *insert = NULL; + MYSQL_RES *result = NULL; + + xfree(tmp); + xstrfmtcat(tmp, "%s", event_req_inx[0]); + for(i=1; i<EVENT_REQ_COUNT; i++) { + xstrfmtcat(tmp, ", %s", event_req_inx[i]); + } + + /* get all the events started before this time + listed */ + query = xstrdup_printf("select %s from %s where " + "period_start <= %d " + "&& period_end != 0 " + "order by period_start asc", + tmp, event_table, curr_end); + + insert = xstrdup_printf("insert into %s (%s) ", + event_table, tmp); + xfree(tmp); + +// START_TIMER; + debug3("%d(%d) query\n%s", mysql_conn->conn, + __LINE__, query); + if(!(result = mysql_db_query_ret( + mysql_conn->db_conn, query, 0))) { + xfree(insert); + xfree(query); + return SLURM_ERROR; + } + xfree(query); +// END_TIMER2("step query"); +// info("event query took %s", TIME_STR); + + if(!mysql_num_rows(result)) { + xfree(insert); + mysql_free_result(result); + goto exit_events; + } + + rc = _write_archive_file( + result, EVENT_REQ_START, EVENT_REQ_COUNT, + curr_end, arch_cond->archive_dir, + "event", insert, false); + + xfree(insert); + mysql_free_result(result); + + if(rc != SLURM_SUCCESS) + return rc; + } + query = xstrdup_printf("delete from %s where " + "period_start <= %d && period_end != 0", + event_table, curr_end); + debug3("%d(%d) query\n%s", mysql_conn->conn, __LINE__, query); + rc = mysql_db_query(mysql_conn->db_conn, query); + xfree(query); + if(rc != SLURM_SUCCESS) { + error("Couldn't remove old event data"); + return SLURM_ERROR; + } + } + +exit_events: + + 
if(arch_cond->purge_suspend) { + /* remove all data from step table that was older than + * period_start * arch_cond->purge_suspend. + */ + /* use localtime to avoid any daylight savings issues */ + if(!localtime_r(&last_submit, &time_tm)) { + error("Couldn't get localtime from first submit %d", + last_submit); + return SLURM_ERROR; + } + time_tm.tm_mday = 1; + time_tm.tm_mon -= arch_cond->purge_suspend; + time_tm.tm_isdst = -1; + curr_end = mktime(&time_tm); + + debug4("from %d - %d months purging suspend from before %d", + last_submit, arch_cond->purge_suspend, curr_end); + + if(arch_cond->archive_suspend) { + char *insert = NULL; + MYSQL_RES *result = NULL; + + xfree(tmp); + xstrfmtcat(tmp, "%s", suspend_req_inx[0]); + for(i=1; i<SUSPEND_REQ_COUNT; i++) { + xstrfmtcat(tmp, ", %s", suspend_req_inx[i]); + } + + /* get all the suspend started before this time + listed */ + query = xstrdup_printf("select %s from %s where " + "start <= %d && end != 0 " + "order by start asc", + tmp, suspend_table, curr_end); + + insert = xstrdup_printf("insert into %s (%s) ", + suspend_table, tmp); + xfree(tmp); + +// START_TIMER; + debug3("%d(%d) query\n%s", mysql_conn->conn, + __LINE__, query); + if(!(result = mysql_db_query_ret( + mysql_conn->db_conn, query, 0))) { + xfree(insert); + xfree(query); + return SLURM_ERROR; + } + xfree(query); +// END_TIMER2("step query"); +// info("suspend query took %s", TIME_STR); + + if(!mysql_num_rows(result)) { + xfree(insert); + mysql_free_result(result); + goto exit_suspend; + } + + rc = _write_archive_file( + result, SUSPEND_REQ_START, SUSPEND_REQ_COUNT, + curr_end, arch_cond->archive_dir, + "suspend", insert, false); + + xfree(insert); + mysql_free_result(result); + + if(rc != SLURM_SUCCESS) + return rc; + } + query = xstrdup_printf("delete from %s where start <= %d " + "&& end != 0", + suspend_table, curr_end); + debug3("%d(%d) query\n%s", mysql_conn->conn, __LINE__, query); + rc = mysql_db_query(mysql_conn->db_conn, query); + xfree(query); 
+ if(rc != SLURM_SUCCESS) { + error("Couldn't remove old suspend data"); + return SLURM_ERROR; + } + } + +exit_suspend: + + if(arch_cond->purge_step) { + /* remove all data from step table that was older than + * start * arch_cond->purge_step. */ /* use localtime to avoid any daylight savings issues */ if(!localtime_r(&last_submit, &time_tm)) { @@ -1420,19 +1797,16 @@ extern int mysql_jobacct_process_archive(mysql_conn_t *mysql_conn, last_submit); return SLURM_ERROR; } - time_tm.tm_mon -= arch_cond->step_purge; + time_tm.tm_mon -= arch_cond->purge_step; time_tm.tm_isdst = -1; curr_end = mktime(&time_tm); debug4("from %d - %d months purging steps from before %d", - last_submit, arch_cond->step_purge, curr_end); + last_submit, arch_cond->purge_step, curr_end); if(arch_cond->archive_steps) { char *insert = NULL; - char *values = NULL; - int period_start = 0; MYSQL_RES *result = NULL; - MYSQL_ROW row; xfree(tmp); xstrfmtcat(tmp, "%s", step_req_inx[0]); @@ -1471,116 +1845,19 @@ extern int mysql_jobacct_process_archive(mysql_conn_t *mysql_conn, mysql_free_result(result); goto exit_steps; } - -// START_TIMER; - slurm_mutex_lock(&local_file_lock); - while((row = mysql_fetch_row(result))) { - if(period_start) { - xstrcat(values, ",\n("); - } else { - period_start = - atoi(row[STEP_REQ_START]); - localtime_r((time_t *)&period_start, - &time_tm); - time_tm.tm_sec = 0; - time_tm.tm_min = 0; - time_tm.tm_hour = 0; - time_tm.tm_mday = 1; - time_tm.tm_isdst = -1; - period_start = mktime(&time_tm); - localtime_r((time_t *)&period_start, - &time_tm); - snprintf(start_char, sizeof(start_char), - "%4.4u-%2.2u-%2.2u" - "T%2.2u:%2.2u:%2.2u", - (time_tm.tm_year + 1900), - (time_tm.tm_mon+1), - time_tm.tm_mday, - time_tm.tm_hour, - time_tm.tm_min, - time_tm.tm_sec); - - localtime_r((time_t *)&curr_end, - &time_tm); - snprintf(end_char, sizeof(end_char), - "%4.4u-%2.2u-%2.2u" - "T%2.2u:%2.2u:%2.2u", - (time_tm.tm_year + 1900), - (time_tm.tm_mon+1), - time_tm.tm_mday, - time_tm.tm_hour, - 
time_tm.tm_min, - time_tm.tm_sec); - - /* write the buffer to file */ - reg_file = xstrdup_printf( - "%s/step_archive_%s_%s.sql", - arch_cond->archive_dir, - start_char, end_char); - debug("Storing step archive at %s", - reg_file); - old_file = xstrdup_printf( - "%s.old", reg_file); - new_file = xstrdup_printf( - "%s.new", reg_file); - - fd = creat(new_file, 0600); - if (fd == 0) { - error("Can't save archive, " - "create file %s error %m", - new_file); - rc = errno; - xfree(insert); - break; - } - values = xstrdup_printf("%s\nvalues\n(", - insert); - xfree(insert); - } - - xstrfmtcat(values, "'%s'", row[0]); - for(i=1; i<STEP_REQ_COUNT; i++) { - xstrfmtcat(values, ", '%s'", row[i]); - } - xstrcat(values, ", '1')"); - - if(!fd || ((rc = _write_to_file(fd, values)) - != SLURM_SUCCESS)) { - xfree(values); - break; - } - xfree(values); - } - mysql_free_result(result); - rc = _write_to_file( - fd, " on duplicate key update deleted=1;"); -// END_TIMER2("write file"); -// info("write file took %s", TIME_STR); - - fsync(fd); - close(fd); - if (rc) - (void) unlink(new_file); - else { /* file shuffle */ - int ign; /* avoid warning */ - (void) unlink(old_file); - ign = link(reg_file, old_file); - (void) unlink(reg_file); - ign = link(new_file, reg_file); - (void) unlink(new_file); - } - xfree(old_file); - xfree(reg_file); - xfree(new_file); - slurm_mutex_unlock(&local_file_lock); + rc = _write_archive_file( + result, STEP_REQ_START, STEP_REQ_COUNT, + curr_end, arch_cond->archive_dir, + "step", insert, true); + + xfree(insert); + mysql_free_result(result); - period_start = 0; + if(rc != SLURM_SUCCESS) + return rc; } - if(rc != SLURM_SUCCESS) - return rc; - query = xstrdup_printf("delete from %s where start <= %d " "&& end != 0", step_table, curr_end); @@ -1594,9 +1871,9 @@ extern int mysql_jobacct_process_archive(mysql_conn_t *mysql_conn, } exit_steps: - if(arch_cond->job_purge) { + if(arch_cond->purge_job) { /* remove all data from step table that was older than - * 
last_submit * arch_cond->job_purge. + * last_submit * arch_cond->purge_job. */ /* use localtime to avoid any daylight savings issues */ if(!localtime_r(&last_submit, &time_tm)) { @@ -1605,20 +1882,17 @@ exit_steps: return SLURM_ERROR; } time_tm.tm_mday = 1; - time_tm.tm_mon -= arch_cond->job_purge; + time_tm.tm_mon -= arch_cond->purge_job; time_tm.tm_isdst = -1; curr_end = mktime(&time_tm); debug4("from %d - %d months purging jobs from before %d", - last_submit, arch_cond->job_purge, curr_end); + last_submit, arch_cond->purge_job, curr_end); if(arch_cond->archive_jobs) { char *insert = NULL; - char *values = NULL; - int period_start = 0; MYSQL_RES *result = NULL; - MYSQL_ROW row; - + xfree(tmp); xstrfmtcat(tmp, "%s", job_req_inx[0]); for(i=1; i<JOB_REQ_COUNT; i++) { @@ -1656,114 +1930,18 @@ exit_steps: goto exit_jobs; } -// START_TIMER; - slurm_mutex_lock(&local_file_lock); - while((row = mysql_fetch_row(result))) { - if(period_start) { - xstrcat(values, ",\n("); - } else { - period_start = - atoi(row[JOB_REQ_SUBMIT]); - localtime_r((time_t *)&period_start, - &time_tm); - time_tm.tm_sec = 0; - time_tm.tm_min = 0; - time_tm.tm_hour = 0; - time_tm.tm_mday = 1; - time_tm.tm_isdst = -1; - period_start = mktime(&time_tm); - localtime_r((time_t *)&period_start, - &time_tm); - snprintf(start_char, sizeof(start_char), - "%4.4u-%2.2u-%2.2u" - "T%2.2u:%2.2u:%2.2u", - (time_tm.tm_year + 1900), - (time_tm.tm_mon+1), - time_tm.tm_mday, - time_tm.tm_hour, - time_tm.tm_min, - time_tm.tm_sec); - - localtime_r((time_t *)&curr_end, - &time_tm); - - snprintf(end_char, sizeof(end_char), - "%4.4u-%2.2u-%2.2u" - "T%2.2u:%2.2u:%2.2u", - (time_tm.tm_year + 1900), - (time_tm.tm_mon+1), - time_tm.tm_mday, - time_tm.tm_hour, - time_tm.tm_min, - time_tm.tm_sec); - - /* write the buffer to file */ - reg_file = xstrdup_printf( - "%s/job_archive_%s_%s.sql", - arch_cond->archive_dir, - start_char, end_char); - debug("Storing job archive at %s", - reg_file); - old_file = xstrdup_printf( - 
"%s.old", reg_file); - new_file = xstrdup_printf( - "%s.new", reg_file); - - fd = creat(new_file, 0600); - if (fd == 0) { - error("Can't save archive, " - "create file %s error %m", - new_file); - rc = errno; - xfree(insert); - break; - } - values = xstrdup_printf("%s\nvalues\n(", - insert); - xfree(insert); - } - - xstrfmtcat(values, "'%s'", row[0]); - for(i=1; i<JOB_REQ_COUNT; i++) { - xstrfmtcat(values, ", '%s'", row[i]); - } - xstrcat(values, ", '1')"); - - if(!fd || ((rc = _write_to_file(fd, values)) - != SLURM_SUCCESS)) { - xfree(values); - break; - } - xfree(values); - } - mysql_free_result(result); - - rc = _write_to_file( - fd, " on duplicate key update deleted=1;"); -// END_TIMER2("write file"); -// info("write file took %s", TIME_STR); - + rc = _write_archive_file( + result, JOB_REQ_SUBMIT, JOB_REQ_COUNT, + curr_end, arch_cond->archive_dir, + "job", insert, true); - fsync(fd); - close(fd); - - if (rc) - (void) unlink(new_file); - else { /* file shuffle */ - int ign; /* avoid warning */ - (void) unlink(old_file); - ign = link(reg_file, old_file); - (void) unlink(reg_file); - ign = link(new_file, reg_file); - (void) unlink(new_file); - } - xfree(old_file); - xfree(reg_file); - xfree(new_file); - slurm_mutex_unlock(&local_file_lock); + xfree(insert); + mysql_free_result(result); - period_start = 0; + if(rc != SLURM_SUCCESS) + return rc; } + query = xstrdup_printf("delete from %s where submit <= %d " "&& end != 0", job_table, curr_end); diff --git a/src/plugins/accounting_storage/mysql/mysql_rollup.c b/src/plugins/accounting_storage/mysql/mysql_rollup.c index ea35654fec80c25fe5da85e9ef68fae678885ee8..3f2e2c8d6c9e1f5b3dc7cc536cbd4bda8e0e1b21 100644 --- a/src/plugins/accounting_storage/mysql/mysql_rollup.c +++ b/src/plugins/accounting_storage/mysql/mysql_rollup.c @@ -1012,24 +1012,6 @@ extern int mysql_daily_rollup(mysql_conn_t *mysql_conn, start_tm.tm_isdst = -1; curr_end = mktime(&start_tm); } - - /* if we didn't ask for archive data return here and don't do 
- anything extra just rollup */ - - if(!archive_data) - return SLURM_SUCCESS; - - /* remove all data from suspend table that was older than - * start. - */ - query = xstrdup_printf("delete from %s where end < %d && end != 0", - suspend_table, start); - rc = mysql_db_query(mysql_conn->db_conn, query); - xfree(query); - if(rc != SLURM_SUCCESS) { - error("Couldn't remove old suspend data"); - return SLURM_ERROR; - } /* info("stop start %s", ctime(&curr_start)); */ /* info("stop end %s", ctime(&curr_end)); */ @@ -1138,28 +1120,20 @@ extern int mysql_monthly_rollup(mysql_conn_t *mysql_conn, if(!archive_data) return SLURM_SUCCESS; - /* remove all data from event table that was older than - * start. - */ - query = xstrdup_printf("delete from %s where period_end < %d " - "&& period_end != 0", - event_table, start); - rc = mysql_db_query(mysql_conn->db_conn, query); - xfree(query); - if(rc != SLURM_SUCCESS) { - error("Couldn't remove old event data"); - return SLURM_ERROR; - } if(!slurmdbd_conf) return SLURM_SUCCESS; memset(&arch_cond, 0, sizeof(arch_cond)); arch_cond.archive_dir = slurmdbd_conf->archive_dir; + arch_cond.archive_events = slurmdbd_conf->archive_events; arch_cond.archive_jobs = slurmdbd_conf->archive_jobs; arch_cond.archive_script = slurmdbd_conf->archive_script; arch_cond.archive_steps = slurmdbd_conf->archive_steps; - arch_cond.job_purge = slurmdbd_conf->job_purge; - arch_cond.step_purge = slurmdbd_conf->step_purge; + arch_cond.archive_suspend = slurmdbd_conf->archive_suspend; + arch_cond.purge_event = slurmdbd_conf->purge_event; + arch_cond.purge_job = slurmdbd_conf->purge_job; + arch_cond.purge_step = slurmdbd_conf->purge_step; + arch_cond.purge_suspend = slurmdbd_conf->purge_suspend; return mysql_jobacct_process_archive(mysql_conn, &arch_cond); } diff --git a/src/sacctmgr/archive_functions.c b/src/sacctmgr/archive_functions.c index b4458c2044851ac46008ad8cf0cad6c50a100009..25caeb5fcce189fbb29ae2f4319fe7d3c26112b6 100644 --- 
a/src/sacctmgr/archive_functions.c +++ b/src/sacctmgr/archive_functions.c @@ -178,6 +178,10 @@ static int _set_cond(int *start, int argc, char *argv[], if(!end && !strncasecmp(argv[i], "where", MAX(command_len, 5))) { continue; + } else if(!end && !strncasecmp(argv[i], "events", + MAX(command_len, 1))) { + arch_cond->archive_events = 1; + set = 1; } else if(!end && !strncasecmp(argv[i], "jobs", MAX(command_len, 1))) { arch_cond->archive_jobs = 1; @@ -186,6 +190,10 @@ static int _set_cond(int *start, int argc, char *argv[], MAX(command_len, 1))) { arch_cond->archive_steps = 1; set = 1; + } else if(!end && !strncasecmp(argv[i], "suspend", + MAX(command_len, 1))) { + arch_cond->archive_suspend = 1; + set = 1; } else if(!end || !strncasecmp (argv[i], "Clusters", MAX(command_len, 1))) { @@ -264,18 +272,34 @@ static int _set_cond(int *start, int argc, char *argv[], slurm_addto_char_list(job_cond->partition_list, argv[i]+end); set = 1; - } else if (!strncasecmp (argv[i], "PurgeJobsBefore", + } else if (!strncasecmp (argv[i], "PurgeEventMonths", MAX(command_len, 6))) { - if (get_uint16(argv[i]+end, &arch_cond->job_purge, - "PurgeJobsBefore") + if (get_uint16(argv[i]+end, &arch_cond->purge_event, + "PurgeEventMonths") != SLURM_SUCCESS) { exit_code = 1; } else set = 1; - } else if (!strncasecmp (argv[i], "PurgeStepsBefore", + } else if (!strncasecmp (argv[i], "PurgeJobMonths", MAX(command_len, 6))) { - if (get_uint16(argv[i]+end, &arch_cond->step_purge, - "PurgeStepsBefore") + if (get_uint16(argv[i]+end, &arch_cond->purge_job, + "PurgeJobMonths") + != SLURM_SUCCESS) { + exit_code = 1; + } else + set = 1; + } else if (!strncasecmp (argv[i], "PurgeStepMonths", + MAX(command_len, 7))) { + if (get_uint16(argv[i]+end, &arch_cond->purge_step, + "PurgeStepMonths") + != SLURM_SUCCESS) { + exit_code = 1; + } else + set = 1; + } else if (!strncasecmp (argv[i], "PurgeSuspendMonths", + MAX(command_len, 7))) { + if (get_uint16(argv[i]+end, &arch_cond->purge_suspend, + 
"PurgeSuspendMonths") != SLURM_SUCCESS) { exit_code = 1; } else @@ -315,10 +339,14 @@ extern int sacctmgr_archive_dump(int argc, char *argv[]) int i=0, set=0; struct stat st; + arch_cond->archive_events = (uint16_t)NO_VAL; arch_cond->archive_jobs = (uint16_t)NO_VAL; arch_cond->archive_steps = (uint16_t)NO_VAL; - arch_cond->job_purge = (uint16_t)NO_VAL; - arch_cond->step_purge = (uint16_t)NO_VAL; + arch_cond->archive_suspend = (uint16_t)NO_VAL; + arch_cond->purge_event = (uint16_t)NO_VAL; + arch_cond->purge_job = (uint16_t)NO_VAL; + arch_cond->purge_step = (uint16_t)NO_VAL; + arch_cond->purge_suspend = (uint16_t)NO_VAL; set = _set_cond(&i, argc, argv, arch_cond); if(exit_code) { diff --git a/src/sacctmgr/cluster_functions.c b/src/sacctmgr/cluster_functions.c index f6b235593ac3d854402a1eaa5fb6323ccc6d65c9..9d1c9a6f742e2333b74cd93dc836c15f51491f25 100644 --- a/src/sacctmgr/cluster_functions.c +++ b/src/sacctmgr/cluster_functions.c @@ -990,7 +990,11 @@ extern int sacctmgr_delete_cluster(int argc, char *argv[]) } if(!list_count(cluster_cond->cluster_list) - || !cluster_cond->classification) { + && !cluster_cond->classification) { + exit_code=1; + fprintf(stderr, + "problem with delete request. 
" + "Nothing given to delete.\n"); destroy_acct_cluster_cond(cluster_cond); return SLURM_SUCCESS; } diff --git a/src/sacctmgr/sacctmgr.c b/src/sacctmgr/sacctmgr.c index a3b7fcd5bd0cd8b617b5b3ff67f62a3c90c671ce..1e456d29349843f257ba400fa09c1afc8c427739 100644 --- a/src/sacctmgr/sacctmgr.c +++ b/src/sacctmgr/sacctmgr.c @@ -857,8 +857,9 @@ sacctmgr [<OPTION>] [<COMMAND>] \n\ list wckey - Clusters=, End=, Format=, IDs=, Names=, \n\ Start=, User=, and WCKeys= \n\ \n\ - archive dump - Directory=, Jobs, PurgeJobsBefore=, \n\ - PurgeStepsBefore=, Script=, and Steps \n\ + archive dump - Directory=, Events, Jobs, PurgeEventMonths=, \n\ + PurgeJobMonths=, PurgeStepMonths=, \n\ + PurgeSuspendMonths=, Script=, Steps and Suspend\n\ \n\ archive load - File=, or Insert= \n\ \n\ diff --git a/src/slurmdbd/proc_req.c b/src/slurmdbd/proc_req.c index a6996c7b72c76aa8a5a6fa140bf2de2e67a53a7b..a052d53970038c188d8d273db7f90d07c90c6407 100644 --- a/src/slurmdbd/proc_req.c +++ b/src/slurmdbd/proc_req.c @@ -869,6 +869,8 @@ static int _archive_dump(slurmdbd_conn_t *slurmdbd_conn, /* set up some defaults */ if(!arch_cond->archive_dir) arch_cond->archive_dir = xstrdup(slurmdbd_conf->archive_dir); + if(arch_cond->archive_events == (uint16_t)NO_VAL) + arch_cond->archive_events = slurmdbd_conf->archive_events; if(arch_cond->archive_jobs == (uint16_t)NO_VAL) arch_cond->archive_jobs = slurmdbd_conf->archive_jobs; if(!arch_cond->archive_script) @@ -876,10 +878,16 @@ static int _archive_dump(slurmdbd_conn_t *slurmdbd_conn, xstrdup(slurmdbd_conf->archive_script); if(arch_cond->archive_steps == (uint16_t)NO_VAL) arch_cond->archive_steps = slurmdbd_conf->archive_steps; - if(arch_cond->job_purge == (uint16_t)NO_VAL) - arch_cond->job_purge = slurmdbd_conf->job_purge; - if(arch_cond->step_purge == (uint16_t)NO_VAL) - arch_cond->step_purge = slurmdbd_conf->step_purge; + if(arch_cond->archive_suspend == (uint16_t)NO_VAL) + arch_cond->archive_suspend = slurmdbd_conf->archive_suspend; + 
if(arch_cond->purge_event == (uint16_t)NO_VAL) + arch_cond->purge_event = slurmdbd_conf->purge_event; + if(arch_cond->purge_job == (uint16_t)NO_VAL) + arch_cond->purge_job = slurmdbd_conf->purge_job; + if(arch_cond->purge_step == (uint16_t)NO_VAL) + arch_cond->purge_step = slurmdbd_conf->purge_step; + if(arch_cond->purge_suspend == (uint16_t)NO_VAL) + arch_cond->purge_suspend = slurmdbd_conf->purge_suspend; rc = jobacct_storage_g_archive(slurmdbd_conn->db_conn, arch_cond); if(rc != SLURM_SUCCESS) { diff --git a/src/slurmdbd/read_config.c b/src/slurmdbd/read_config.c index 28d201eb87dbeaa344e5a6c29cd4bb7bf8deee41..edac834022efbdf9783fb5245e5cc3695893b040 100644 --- a/src/slurmdbd/read_config.c +++ b/src/slurmdbd/read_config.c @@ -84,9 +84,11 @@ static void _clear_slurmdbd_conf(void) { if (slurmdbd_conf) { xfree(slurmdbd_conf->archive_dir); + slurmdbd_conf->archive_events = 0; slurmdbd_conf->archive_jobs = 0; xfree(slurmdbd_conf->archive_script); slurmdbd_conf->archive_steps = 0; + slurmdbd_conf->archive_suspend = 0; xfree(slurmdbd_conf->auth_info); xfree(slurmdbd_conf->auth_type); xfree(slurmdbd_conf->dbd_addr); @@ -95,14 +97,16 @@ static void _clear_slurmdbd_conf(void) slurmdbd_conf->dbd_port = 0; slurmdbd_conf->debug_level = 0; xfree(slurmdbd_conf->default_qos); - slurmdbd_conf->job_purge = 0; xfree(slurmdbd_conf->log_file); xfree(slurmdbd_conf->pid_file); xfree(slurmdbd_conf->plugindir); slurmdbd_conf->private_data = 0; + slurmdbd_conf->purge_event = 0; + slurmdbd_conf->purge_job = 0; + slurmdbd_conf->purge_step = 0; + slurmdbd_conf->purge_suspend = 0; slurmdbd_conf->slurm_user_id = NO_VAL; xfree(slurmdbd_conf->slurm_user_name); - slurmdbd_conf->step_purge = 0; xfree(slurmdbd_conf->storage_backup_host); xfree(slurmdbd_conf->storage_host); xfree(slurmdbd_conf->storage_loc); @@ -124,9 +128,11 @@ extern int read_slurmdbd_conf(void) { s_p_options_t options[] = { {"ArchiveDir", S_P_STRING}, + {"ArchiveEvents", S_P_BOOLEAN}, {"ArchiveJobs", S_P_BOOLEAN}, 
{"ArchiveScript", S_P_STRING}, {"ArchiveSteps", S_P_BOOLEAN}, + {"ArchiveSuspend", S_P_BOOLEAN}, {"AuthInfo", S_P_STRING}, {"AuthType", S_P_STRING}, {"DbdAddr", S_P_STRING}, @@ -141,6 +147,10 @@ extern int read_slurmdbd_conf(void) {"PidFile", S_P_STRING}, {"PluginDir", S_P_STRING}, {"PrivateData", S_P_STRING}, + {"PurgeEventMonths", S_P_UINT16}, + {"PurgeJobMonths", S_P_UINT16}, + {"PurgeStepMonths", S_P_UINT16}, + {"PurgeSuspendMonths", S_P_UINT16}, {"SlurmUser", S_P_STRING}, {"StepPurge", S_P_UINT16}, {"StorageBackupHost", S_P_STRING}, @@ -176,13 +186,15 @@ extern int read_slurmdbd_conf(void) tbl = s_p_hashtbl_create(options); if (s_p_parse_file(tbl, conf_path) == SLURM_ERROR) { fatal("Could not open/read/parse slurmdbd.conf file %s", - conf_path); + conf_path); } if(!s_p_get_string(&slurmdbd_conf->archive_dir, "ArchiveDir", tbl)) slurmdbd_conf->archive_dir = xstrdup(DEFAULT_SLURMDBD_ARCHIVE_DIR); + s_p_get_boolean((bool *)&slurmdbd_conf->archive_events, + "ArchiveEvents", tbl); s_p_get_boolean((bool *)&slurmdbd_conf->archive_jobs, "ArchiveJobs", tbl); s_p_get_string(&slurmdbd_conf->archive_script, "ArchiveScript", @@ -198,7 +210,7 @@ extern int read_slurmdbd_conf(void) s_p_get_uint16(&slurmdbd_conf->dbd_port, "DbdPort", tbl); s_p_get_uint16(&slurmdbd_conf->debug_level, "DebugLevel", tbl); s_p_get_string(&slurmdbd_conf->default_qos, "DefaultQOS", tbl); - s_p_get_uint16(&slurmdbd_conf->job_purge, "JobPurge", tbl); + s_p_get_uint16(&slurmdbd_conf->purge_job, "JobPurge", tbl); s_p_get_string(&slurmdbd_conf->log_file, "LogFile", tbl); if (!s_p_get_uint16(&slurmdbd_conf->msg_timeout, "MessageTimeout", tbl)) @@ -236,25 +248,34 @@ extern int read_slurmdbd_conf(void) xfree(temp_str); } + s_p_get_uint16(&slurmdbd_conf->purge_event, + "PurgeEventMonths", tbl); + s_p_get_uint16(&slurmdbd_conf->purge_job, + "PurgeJobMonths", tbl); + s_p_get_uint16(&slurmdbd_conf->purge_step, + "PurgeStepMonths", tbl); + s_p_get_uint16(&slurmdbd_conf->purge_suspend, + "PurgeSuspendMonths", 
tbl); + s_p_get_string(&slurmdbd_conf->slurm_user_name, "SlurmUser", tbl); - s_p_get_uint16(&slurmdbd_conf->step_purge, "StepPurge", tbl); + s_p_get_uint16(&slurmdbd_conf->purge_step, "StepPurge", tbl); s_p_get_string(&slurmdbd_conf->storage_backup_host, - "StorageBackupHost", tbl); + "StorageBackupHost", tbl); s_p_get_string(&slurmdbd_conf->storage_host, - "StorageHost", tbl); + "StorageHost", tbl); s_p_get_string(&slurmdbd_conf->storage_loc, - "StorageLoc", tbl); + "StorageLoc", tbl); s_p_get_string(&slurmdbd_conf->storage_pass, - "StoragePass", tbl); + "StoragePass", tbl); s_p_get_uint16(&slurmdbd_conf->storage_port, "StoragePort", tbl); s_p_get_string(&slurmdbd_conf->storage_type, "StorageType", tbl); s_p_get_string(&slurmdbd_conf->storage_user, - "StorageUser", tbl); - + "StorageUser", tbl); + if(!s_p_get_boolean((bool *)&slurmdbd_conf->track_wckey, "TrackWCKey", tbl)) slurmdbd_conf->track_wckey = false; @@ -329,7 +350,11 @@ extern void log_config(void) char tmp_str[128]; debug2("ArchiveDir = %s", slurmdbd_conf->archive_dir); + debug2("ArchiveEvents = %u", slurmdbd_conf->archive_events); + debug2("ArchiveJobs = %u", slurmdbd_conf->archive_jobs); debug2("ArchiveScript = %s", slurmdbd_conf->archive_script); + debug2("ArchiveSteps = %u", slurmdbd_conf->archive_steps); + debug2("ArchiveSuspend = %u", slurmdbd_conf->archive_suspend); debug2("AuthInfo = %s", slurmdbd_conf->auth_info); debug2("AuthType = %s", slurmdbd_conf->auth_type); debug2("DbdAddr = %s", slurmdbd_conf->dbd_addr); @@ -339,12 +364,6 @@ extern void log_config(void) debug2("DebugLevel = %u", slurmdbd_conf->debug_level); debug2("DefaultQOS = %s", slurmdbd_conf->default_qos); - if(slurmdbd_conf->job_purge) - debug2("JobPurge = %u months", - slurmdbd_conf->job_purge); - else - debug2("JobPurge = NONE"); - debug2("LogFile = %s", slurmdbd_conf->log_file); debug2("MessageTimeout = %u", slurmdbd_conf->msg_timeout); debug2("PidFile = %s", slurmdbd_conf->pid_file); @@ -354,15 +373,34 @@ extern void 
log_config(void) tmp_str, sizeof(tmp_str)); debug2("PrivateData = %s", tmp_str); - debug2("SlurmUser = %s(%u)", - slurmdbd_conf->slurm_user_name, slurmdbd_conf->slurm_user_id); - if(slurmdbd_conf->step_purge) - debug2("StepPurge = %u months", - slurmdbd_conf->step_purge); + if(slurmdbd_conf->purge_job) + debug2("PurgeJobMonths = %u months", + slurmdbd_conf->purge_job); + else + debug2("PurgeJobMonths = NONE"); + + if(slurmdbd_conf->purge_event) + debug2("PurgeEventMonths = %u months", + slurmdbd_conf->purge_event); + else + debug2("PurgeEventMonths = NONE"); + + if(slurmdbd_conf->purge_step) + debug2("PurgeStepMonths = %u months", + slurmdbd_conf->purge_step); + else + debug2("PurgeStepMonths = NONE"); + + if(slurmdbd_conf->purge_suspend) + debug2("PurgeSuspendMonths= %u months", + slurmdbd_conf->purge_suspend); else - debug2("StepPurge = NONE"); + debug2("PurgeSuspendMonths= NONE"); + debug2("SlurmUser = %s(%u)", + slurmdbd_conf->slurm_user_name, slurmdbd_conf->slurm_user_id); + debug2("StorageBackupHost = %s", slurmdbd_conf->storage_backup_host); debug2("StorageHost = %s", slurmdbd_conf->storage_host); debug2("StorageLoc = %s", slurmdbd_conf->storage_loc); @@ -370,6 +408,7 @@ extern void log_config(void) debug2("StoragePort = %u", slurmdbd_conf->storage_port); debug2("StorageType = %s", slurmdbd_conf->storage_type); debug2("StorageUser = %s", slurmdbd_conf->storage_user); + debug2("TrackWCKey = %u", slurmdbd_conf->track_wckey); } @@ -435,11 +474,35 @@ extern List dump_config(void) key_pair->value = xstrdup(slurmdbd_conf->archive_dir); list_append(my_list, key_pair); + key_pair = xmalloc(sizeof(config_key_pair_t)); + key_pair->name = xstrdup("ArchiveEvents"); + key_pair->value = xmalloc(16); + snprintf(key_pair->value, 16, "%u", slurmdbd_conf->archive_events); + list_append(my_list, key_pair); + + key_pair = xmalloc(sizeof(config_key_pair_t)); + key_pair->name = xstrdup("ArchiveJobs"); + key_pair->value = xmalloc(16); + snprintf(key_pair->value, 16, "%u", 
slurmdbd_conf->archive_jobs); + list_append(my_list, key_pair); + key_pair = xmalloc(sizeof(config_key_pair_t)); key_pair->name = xstrdup("ArchiveScript"); key_pair->value = xstrdup(slurmdbd_conf->archive_script); list_append(my_list, key_pair); + key_pair = xmalloc(sizeof(config_key_pair_t)); + key_pair->name = xstrdup("ArchiveSteps"); + key_pair->value = xmalloc(16); + snprintf(key_pair->value, 16, "%u", slurmdbd_conf->archive_steps); + list_append(my_list, key_pair); + + key_pair = xmalloc(sizeof(config_key_pair_t)); + key_pair->name = xstrdup("ArchiveSuspend"); + key_pair->value = xmalloc(16); + snprintf(key_pair->value, 16, "%u", slurmdbd_conf->archive_suspend); + list_append(my_list, key_pair); + key_pair = xmalloc(sizeof(config_key_pair_t)); key_pair->name = xstrdup("AuthInfo"); key_pair->value = xstrdup(slurmdbd_conf->auth_info); @@ -488,16 +551,6 @@ extern List dump_config(void) key_pair->value = xstrdup(slurmdbd_conf->default_qos); list_append(my_list, key_pair); - key_pair = xmalloc(sizeof(config_key_pair_t)); - key_pair->name = xstrdup("JobPurge"); - if(slurmdbd_conf->job_purge) { - key_pair->value = xmalloc(32); - snprintf(key_pair->value, 32, "%u months", - slurmdbd_conf->job_purge); - } else - key_pair->value = xstrdup("NONE"); - list_append(my_list, key_pair); - key_pair = xmalloc(sizeof(config_key_pair_t)); key_pair->name = xstrdup("LogFile"); key_pair->value = xstrdup(slurmdbd_conf->log_file); @@ -526,6 +579,46 @@ extern List dump_config(void) key_pair->value, 128); list_append(my_list, key_pair); + key_pair = xmalloc(sizeof(config_key_pair_t)); + key_pair->name = xstrdup("PurgeEventMonths"); + if(slurmdbd_conf->purge_event) { + key_pair->value = xmalloc(32); + snprintf(key_pair->value, 32, "%u months", + slurmdbd_conf->purge_event); + } else + key_pair->value = xstrdup("NONE"); + list_append(my_list, key_pair); + + key_pair = xmalloc(sizeof(config_key_pair_t)); + key_pair->name = xstrdup("PurgeJobMonths"); + if(slurmdbd_conf->purge_job) { +
key_pair->value = xmalloc(32); + snprintf(key_pair->value, 32, "%u months", + slurmdbd_conf->purge_job); + } else + key_pair->value = xstrdup("NONE"); + list_append(my_list, key_pair); + + key_pair = xmalloc(sizeof(config_key_pair_t)); + key_pair->name = xstrdup("PurgeStepMonths"); + if(slurmdbd_conf->purge_step) { + key_pair->value = xmalloc(32); + snprintf(key_pair->value, 32, "%u months", + slurmdbd_conf->purge_step); + } else + key_pair->value = xstrdup("NONE"); + list_append(my_list, key_pair); + + key_pair = xmalloc(sizeof(config_key_pair_t)); + key_pair->name = xstrdup("PurgeSuspendMonths"); + if(slurmdbd_conf->purge_suspend) { + key_pair->value = xmalloc(32); + snprintf(key_pair->value, 32, "%u months", + slurmdbd_conf->purge_suspend); + } else + key_pair->value = xstrdup("NONE"); + list_append(my_list, key_pair); + key_pair = xmalloc(sizeof(config_key_pair_t)); key_pair->name = xstrdup("SLURMDBD_CONF"); key_pair->value = _get_conf_path(); @@ -536,7 +629,6 @@ extern List dump_config(void) key_pair->value = xstrdup(SLURM_VERSION); list_append(my_list, key_pair); - key_pair = xmalloc(sizeof(config_key_pair_t)); key_pair->name = xstrdup("SlurmUser"); key_pair->value = xmalloc(128); @@ -544,16 +636,6 @@ extern List dump_config(void) slurmdbd_conf->slurm_user_name, slurmdbd_conf->slurm_user_id); list_append(my_list, key_pair); - key_pair = xmalloc(sizeof(config_key_pair_t)); - key_pair->name = xstrdup("StepPurge"); - if(slurmdbd_conf->job_purge) { - key_pair->value = xmalloc(32); - snprintf(key_pair->value, 32, "%u months", - slurmdbd_conf->step_purge); - } else - key_pair->value = xstrdup("NONE"); - list_append(my_list, key_pair); - key_pair = xmalloc(sizeof(config_key_pair_t)); key_pair->name = xstrdup("StorageBackupHost"); key_pair->value = xstrdup(slurmdbd_conf->storage_backup_host); diff --git a/src/slurmdbd/read_config.h b/src/slurmdbd/read_config.h index ee15a7683313798b33533b0246703b79ef50f9de..9f198eb640f8bcaed4f51a95501ee93ba0ebb9e6 100644 --- 
a/src/slurmdbd/read_config.h +++ b/src/slurmdbd/read_config.h @@ -65,6 +65,8 @@ /* SlurmDBD configuration parameters */ typedef struct slurm_dbd_conf { time_t last_update; /* time slurmdbd.conf read */ + uint16_t archive_events; /* flag if we are to + * archive events */ uint16_t archive_jobs; /* flag if we are to * archive jobs */ char * archive_dir; /* location to localy @@ -73,8 +75,12 @@ typedef struct slurm_dbd_conf { char * archive_script; /* script to archive old data */ uint16_t archive_steps; /* flag if we are to * archive steps */ + uint16_t archive_suspend;/* flag if we are to + * archive suspend data */ char * auth_info; /* authentication info */ char * auth_type; /* authentication mechanism */ + uint16_t control_timeout;/* how long to wait before + * backup takes control */ char * dbd_addr; /* network address of Slurm DBD */ char * dbd_backup; /* hostname of Slurm DBD backup */ char * dbd_host; /* hostname of Slurm DBD */ @@ -82,17 +88,19 @@ typedef struct slurm_dbd_conf { uint16_t debug_level; /* Debug level, default=3 */ char * default_qos; /* default qos setting when * adding clusters */ - uint16_t job_purge; /* purge time for job info */ char * log_file; /* Log file */ uint16_t msg_timeout; /* message timeout */ char * pid_file; /* where to store current PID */ char * plugindir; /* dir to look for plugins */ uint16_t private_data; /* restrict information */ + uint16_t purge_event; /* purge events older than + * this in months */ + uint16_t purge_job; /* purge time for job info */ + uint16_t purge_step; /* purge time for step info */ + uint16_t purge_suspend; /* purge suspend data older than this + * in months */ uint32_t slurm_user_id; /* uid of slurm_user_name */ char * slurm_user_name;/* user that slurmcdtld runs as */ - uint16_t control_timeout;/* how long to wait before - * backup takes control */ - uint16_t step_purge; /* purge time for step info */ char * storage_backup_host;/* backup host where DB is * running */ char * storage_host; /* host 
where DB is running */