diff --git a/doc/man/man1/sacctmgr.1 b/doc/man/man1/sacctmgr.1 index 895e0844ce1a71a03883c23315c60cbf84f47ea2..1293c3f7d11b005e6811da171e9058155effdad3 100644 --- a/doc/man/man1/sacctmgr.1 +++ b/doc/man/man1/sacctmgr.1 @@ -191,6 +191,10 @@ add users or sub accounts to the account they are coordinator over. This should be a trusted person since they can change limits on account and user associations inside their realm. +.TP +\fIevent\fR +Events like downed or draining nodes on clusters. + .TP \fIqos\fR Quality of Service. @@ -418,15 +422,15 @@ in the \fILIST/SHOW ASSOCIATION FORMAT OPTIONS\fP section. .SH "SPECIFICATIONS FOR ASSOCIATIONS" .TP -\fICluster\fP=<comma separated list of cluster names> +\fIClusters\fP=<comma separated list of cluster names> List the associations of the cluster(s). .TP -\fIAccount\fP=<comma separated list of account names> +\fIAccounts\fP=<comma separated list of account names> List the associations of the account(s). .TP -\fIUser\fP=<comma separated list of user names> +\fIUsers\fP=<comma separated list of user names> List the associations of the user(s). .TP @@ -659,6 +663,133 @@ account or list user. .RE +.SH "SPECIFICATIONS FOR EVENTS" + +.TP +\fIAll_Clusters\fP +Shortcut to get information on all clusters. + +.TP +\fIAll_Time\fP +Shortcut to get information for all time. + +.TP +\fIClusters\fP=<comma separated list of cluster names> +List the events of the cluster(s). Default is the cluster where the +command was run. + +.TP +\fIEnd\fP=<OPT> +Period ending of events. Default is now. +Valid time formats are... +HH:MM[:SS] [AM|PM] +MMDD[YY] or MM/DD[/YY] or MM.DD[.YY] +MM/DD[/YY]-HH:MM[:SS] +YYYY-MM-DD[THH:MM[:SS]] + +.TP +\fIEvent\fP=<OPT> +Specific events to look for, valid options are Cluster or Node, +default is both. + +.TP +\fIMaxCPUs\fP=<OPT> +Max number of cpus affected by an event. + +.TP +\fIMinCPUs\fP=<OPT> +Min number of cpus affected by an event. 
+ +.TP +\fINodes\fP=<comma separated list of node names> +Node names affected by an event. + +.TP +\fIReason\fP=<comma separated list of reasons> +Reason an event happened. + +.TP +\fIStart\fP=<OPT> +Period start of events. Default is 00:00:00 of previous day, unless +states are given with the States= spec events. If this is the case +the default behavior is to return events currently in +the states specified. + +Valid time formats are... +HH:MM[:SS] [AM|PM] +MMDD[YY] or MM/DD[/YY] or MM.DD[.YY] +MM/DD[/YY]-HH:MM[:SS] +YYYY-MM-DD[THH:MM[:SS]] + +.TP +\fIStates\fP=<comma separated list of states> +State of a node in a node event. If this is set, the event type is +set automatically to Node. + +.TP +\fIUser\fP=<comma separated list of users> +Query against users who set the event. If this is set, the event type is +set automatically to Node since only user slurm can perform a cluster event. + +.RE + +.SH "LIST/SHOW EVENT FORMAT OPTIONS" + +.TP +\fICluster\fP +The name of the cluster the event happened on. + +.TP +\fIClusterNodes\fP +The hostlist of nodes on a cluster in a cluster event. + +.TP +\fICPUs\fP +Number of cpus involved with the event. + +.TP +\fIDuration\fP +Time period the event was around for. + +.TP +\fIEnd\fP +Period when event ended. + +.TP +\fIEvent\fP +Name of the event. + +.TP +\fIEventRaw\fP +Numeric value of the name of the event. + +.TP +\fINodeName\fP +The node affected by the event. In a cluster event, this is blank. + +.TP +\fIReason\fP +The reason an event happened. + +.TP +\fIStart\fP +Period when event started. + +.TP +\fIState\fP +On a node event this is the formatted state of the node during the event. + +.TP +\fIStateRaw\fP +On a node event this is the numeric value of the state of the node +during the event. + +.TP +\fIUser\fP +On a node event this is the user who caused the event to happen. 
+.RE + + .SH "SPECIFICATIONS FOR QOS" .TP diff --git a/src/plugins/accounting_storage/mysql/as_mysql_cluster.c b/src/plugins/accounting_storage/mysql/as_mysql_cluster.c index cbdb2cdccef378bee1127a7c0f1f8fca889e778d..7c71e37a9f34eac992d3bf59d038ef59e3d54341 100644 --- a/src/plugins/accounting_storage/mysql/as_mysql_cluster.c +++ b/src/plugins/accounting_storage/mysql/as_mysql_cluster.c @@ -817,7 +817,7 @@ extern List as_mysql_get_cluster_events(mysql_conn_t *mysql_conn, uint32_t uid, else xstrcat(extra, " where ("); - xstrfmtcat(query, + xstrfmtcat(extra, "(time_start < %d) " "&& (time_end >= %d || time_end = 0))", event_cond->period_end, event_cond->period_start); @@ -904,7 +904,9 @@ empty: if(!(result = mysql_db_query_ret( mysql_conn->db_conn, query, 0))) { xfree(query); - return NULL; + list_destroy(ret_list); + ret_list = NULL; + break; } xfree(query); diff --git a/src/sacctmgr/event_functions.c b/src/sacctmgr/event_functions.c index 001177389958d4945656526af96ba41e73658040..6865ff7acc22c2a6f2d17582ac2256cbebb3ffe0 100644 --- a/src/sacctmgr/event_functions.c +++ b/src/sacctmgr/event_functions.c @@ -41,6 +41,7 @@ #include "src/sacctmgr/sacctmgr.h" #include "src/common/slurmdbd_defs.h" #include "src/common/uid.h" +#include <grp.h> static uint32_t _decode_node_state(char *val) { @@ -166,6 +167,114 @@ static int _addto_state_char_list(List char_list, char *names) return count; } +static char *_convert_to_id(char *name, bool gid) +{ + if(gid) { + struct group *grp; + if (!(grp=getgrnam(name))) { + fprintf(stderr, "Invalid group id: %s\n", name); + exit(1); + } + xfree(name); + name = xstrdup_printf("%d", grp->gr_gid); + } else { + struct passwd *pwd; + if (!(pwd=getpwnam(name))) { + fprintf(stderr, "Invalid user id: %s\n", name); + exit(1); + } + xfree(name); + name = xstrdup_printf("%d", pwd->pw_uid); + } + return name; +} + +/* returns number of objects added to list */ +static int _addto_id_char_list(List char_list, char *names, bool gid) +{ + int i=0, start=0; + 
char *name = NULL, *tmp_char = NULL; + ListIterator itr = NULL; + char quote_c = '\0'; + int quote = 0; + int count = 0; + + if(!char_list) { + error("No list was given to fill in"); + return 0; + } + + itr = list_iterator_create(char_list); + if(names) { + if (names[i] == '\"' || names[i] == '\'') { + quote_c = names[i]; + quote = 1; + i++; + } + start = i; + while(names[i]) { + //info("got %d - %d = %d", i, start, i-start); + if(quote && names[i] == quote_c) + break; + else if (names[i] == '\"' || names[i] == '\'') + names[i] = '`'; + else if(names[i] == ',') { + if((i-start) > 0) { + name = xmalloc((i-start+1)); + memcpy(name, names+start, (i-start)); + //info("got %s %d", name, i-start); + if (!isdigit((int) *name)) { + name = _convert_to_id( + name, gid); + } + + while((tmp_char = list_next(itr))) { + if(!strcasecmp(tmp_char, name)) + break; + } + + if(!tmp_char) { + list_append(char_list, name); + count++; + } else + xfree(name); + list_iterator_reset(itr); + } + i++; + start = i; + if(!names[i]) { + info("There is a problem with " + "your request. 
It appears you " + "have spaces inside your list."); + break; + } + } + i++; + } + if((i-start) > 0) { + name = xmalloc((i-start)+1); + memcpy(name, names+start, (i-start)); + + if (!isdigit((int) *name)) { + name = _convert_to_id(name, gid); + } + + while((tmp_char = list_next(itr))) { + if(!strcasecmp(tmp_char, name)) + break; + } + + if(!tmp_char) { + list_append(char_list, name); + count++; + } else + xfree(name); + } + } + list_iterator_destroy(itr); + return count; +} + static int _set_cond(int *start, int argc, char *argv[], slurmdb_event_cond_t *event_cond, List format_list) @@ -175,6 +284,7 @@ static int _set_cond(int *start, int argc, char *argv[], int command_len = 0; int option = 0; int local_cluster_flag = 0; + int all_time_flag = 0; if(!event_cond->cluster_list) event_cond->cluster_list = list_create(slurm_destroy_char); @@ -191,8 +301,11 @@ static int _set_cond(int *start, int argc, char *argv[], } if(!end && !strncasecmp(argv[i], "all_clusters", - MAX(command_len, 1))) { + MAX(command_len, 5))) { local_cluster_flag = 1; + } else if(!end && !strncasecmp(argv[i], "all_time", + MAX(command_len, 5))) { + all_time_flag = 1; } else if(!end && !strncasecmp(argv[i], "where", MAX(command_len, 5))) { continue; @@ -245,6 +358,13 @@ static int _set_cond(int *start, int argc, char *argv[], if(slurm_addto_char_list(event_cond->cluster_list, argv[i]+end)) set = 1; + } else if (!strncasecmp (argv[i], "End", MAX(command_len, 1))) { + event_cond->period_end = parse_time(argv[i]+end, 1); + set = 1; + } else if (!strncasecmp (argv[i], "Format", + MAX(command_len, 1))) { + if(format_list) + slurm_addto_char_list(format_list, argv[i]+end); } else if (!strncasecmp (argv[i], "MinCpus", MAX(command_len, 2))) { if (get_uint(argv[i]+end, &event_cond->cpus_min, @@ -263,13 +383,6 @@ static int _set_cond(int *start, int argc, char *argv[], if(slurm_addto_char_list(event_cond->node_list, argv[i]+end)) set = 1; - } else if (!strncasecmp (argv[i], "End", MAX(command_len, 1))) { - 
event_cond->period_end = parse_time(argv[i]+end, 1); - set = 1; - } else if (!strncasecmp (argv[i], "Format", - MAX(command_len, 1))) { - if(format_list) - slurm_addto_char_list(format_list, argv[i]+end); } else if (!strncasecmp (argv[i], "Reason", MAX(command_len, 1))) { if(!event_cond->reason_list) @@ -288,16 +401,20 @@ static int _set_cond(int *start, int argc, char *argv[], event_cond->state_list = list_create(slurm_destroy_char); if(_addto_state_char_list(event_cond->state_list, - argv[i]+end)) + argv[i]+end)) { + event_cond->event_type = SLURMDB_EVENT_NODE; set = 1; + } } else if (!strncasecmp (argv[i], "User", MAX(command_len, 1))) { if(!event_cond->reason_uid_list) event_cond->reason_uid_list = list_create(slurm_destroy_char); - if(slurm_addto_char_list(event_cond->reason_uid_list, - argv[i]+end)) + if(_addto_id_char_list(event_cond->reason_uid_list, + argv[i]+end, 0)) { + event_cond->event_type = SLURMDB_EVENT_NODE; set = 1; + } } else { exit_code=1; fprintf(stderr, " Unknown condition: %s\n", argv[i]); @@ -311,18 +428,26 @@ static int _set_cond(int *start, int argc, char *argv[], list_append(event_cond->cluster_list, temp); } - /* UNCOMMENT THIS IN IF WE EVER DECIDE TO LIMIT THE NUMBER OF EVENTS - RETURNED */ - - /* This needs to be done on some systems to make sure - assoc_cond isn't messed. This has happened on some 64 - bit machines and this is here to be on the safe side. 
- */ - /* start_time = event_cond->usage_start; */ - /* end_time = event_cond->usage_end; */ - /* slurmdb_report_set_start_end_time(&event_time, &event_time); */ - /* event_cond->usage_start = start_time; */ - /* event_cond->usage_end = end_time; */ + if(!all_time_flag && !event_cond->period_start) { + event_cond->period_start = time(NULL); + if(!event_cond->state_list) { + struct tm start_tm; + + if(!localtime_r(&event_cond->period_start, &start_tm)) { + fprintf(stderr, + " Couldn't get localtime from %ld", + event_cond->period_start); + exit_code=1; + return 0; + } + start_tm.tm_sec = 0; + start_tm.tm_min = 0; + start_tm.tm_hour = 0; + start_tm.tm_mday--; + start_tm.tm_isdst = -1; + event_cond->period_start = mktime(&start_tm); + } + } return set; } @@ -357,6 +482,7 @@ extern int sacctmgr_list_event(int argc, char *argv[]) PRINT_NODENAME, PRINT_START, PRINT_REASON, + PRINT_STATERAW, PRINT_STATE, PRINT_USER }; @@ -404,7 +530,7 @@ extern int sacctmgr_list_event(int argc, char *argv[]) field = xmalloc(sizeof(print_field_t)); if(!strncasecmp("ClusterNodes", object, MAX(command_len, 8))) { - field->type = PRINT_CLUSTER; + field->type = PRINT_CLUSTER_NODES; field->name = xstrdup("Cluster Nodes"); field->len = 20; field->print_routine = print_fields_str; @@ -424,7 +550,7 @@ extern int sacctmgr_list_event(int argc, char *argv[]) MAX(command_len, 2))) { field->type = PRINT_DURATION; field->name = xstrdup("Duration"); - field->len = 19; + field->len = 13; field->print_routine = print_fields_time_from_secs; } else if(!strncasecmp("End", object, MAX(command_len, 2))) { field->type = PRINT_END; @@ -447,7 +573,12 @@ extern int sacctmgr_list_event(int argc, char *argv[]) MAX(command_len, 1))) { field->type = PRINT_NODENAME; field->name = xstrdup("Node Name"); - field->len = 15; + field->len = -15; + field->print_routine = print_fields_str; + } else if(!strncasecmp("Reason", object, MAX(command_len, 1))) { + field->type = PRINT_REASON; + field->name = xstrdup("Reason"); + 
field->len = 30; field->print_routine = print_fields_str; } else if(!strncasecmp("Start", object, MAX(command_len, 1))) { @@ -455,11 +586,12 @@ extern int sacctmgr_list_event(int argc, char *argv[]) field->name = xstrdup("Start"); field->len = 19; field->print_routine = print_fields_date; - } else if(!strncasecmp("Reason", object, MAX(command_len, 1))) { - field->type = PRINT_REASON; - field->name = xstrdup("Reason"); - field->len = 20; - field->print_routine = print_fields_str; + } else if(!strncasecmp("StateRaw", object, + MAX(command_len, 6))) { + field->type = PRINT_STATERAW; + field->name = xstrdup("StateRaw"); + field->len = 8; + field->print_routine = print_fields_uint; } else if(!strncasecmp("State", object, MAX(command_len, 1))) { field->type = PRINT_STATE; field->name = xstrdup("State"); @@ -468,7 +600,7 @@ extern int sacctmgr_list_event(int argc, char *argv[]) } else if(!strncasecmp("User", object, MAX(command_len, 1))) { field->type = PRINT_USER; field->name = xstrdup("User"); - field->len = 10; + field->len = 15; field->print_routine = print_fields_str; } else { exit_code=1; @@ -571,6 +703,10 @@ extern int sacctmgr_list_event(int argc, char *argv[]) field->print_routine(field, event->reason, (curr_inx == field_count)); break; + case PRINT_STATERAW: + field->print_routine(field, event->state, + (curr_inx == field_count)); + break; case PRINT_STATE: if(event->event_type == SLURMDB_EVENT_CLUSTER) tmp_char = NULL; @@ -583,10 +719,14 @@ extern int sacctmgr_list_event(int argc, char *argv[]) (curr_inx == field_count)); break; case PRINT_USER: - tmp_char = uid_to_string(event->reason_uid); - snprintf(tmp, sizeof(tmp), "%s(%u)", - tmp_char, event->reason_uid); - xfree(tmp_char); + if(event->reason_uid != NO_VAL) { + tmp_char = uid_to_string( + event->reason_uid); + snprintf(tmp, sizeof(tmp), "%s(%u)", + tmp_char, event->reason_uid); + xfree(tmp_char); + } else + memset(tmp, 0, sizeof(tmp)); field->print_routine(field, tmp, (curr_inx == field_count)); break; 
diff --git a/src/sacctmgr/sacctmgr.c b/src/sacctmgr/sacctmgr.c index 82a34fe6da7aff727bedd16601dc27e5b6357ba8..d442a58e348c84675771c8c044931027075408d5 100644 --- a/src/sacctmgr/sacctmgr.c +++ b/src/sacctmgr/sacctmgr.c @@ -851,6 +851,10 @@ sacctmgr [<OPTION>] [<COMMAND>] \n\ add coordinator - Accounts=, and Names= \n\ delete coordinator - Accounts=, and Names= \n\ \n\ + list events - All_Clusters, All_Time, Clusters=, End=, Events=,\n\ + Format=, MaxCpus=, MinCpus=, Nodes=, Reason=, \n\ + Start=, States=, and User= \n\ + \n\ list qos - Descriptions=, Format=, Ids=, Names=, \n\ and WithDeleted \n\ add qos - Description=, GrpCPUMins=, GrpCPUs=, GrpJobs=, \n\ @@ -916,6 +920,10 @@ sacctmgr [<OPTION>] [<COMMAND>] \n\ MaxCPUs, MaxJobs, MaxNodes, MaxSubmitJobs, \n\ MaxWall, NodeCount, NodeNames \n\ \n\ + Event - Cluster, ClusterNodes, CPUs, Duration, End, \n\ + Event, EventRaw, NodeName, Reason, Start, \n\ + State, StateRaw, User \n\ + \n\ QOS - Description, GrpCPUMins, GrpCPUs, GrpJobs, \n\ GrpNodes, GrpSubmitJob, GrpWall, ID, \n\ MaxCPUMins, MaxCPUs, MaxJobs, MaxNodes, \n\