From d9640972940118f21b1823c116768c83e70da97a Mon Sep 17 00:00:00 2001 From: Danny Auble <da@schedmd.com> Date: Fri, 8 Jul 2016 13:54:25 -0700 Subject: [PATCH] Rename LostJobs RunawayJobs in sacctmgr. --- doc/man/man1/sacctmgr.1 | 7 ++- src/common/slurm_accounting_storage.c | 9 ++-- src/common/slurm_accounting_storage.h | 7 +-- src/common/slurmdbd_defs.c | 6 +-- src/common/slurmdbd_defs.h | 2 +- .../filetxt/accounting_storage_filetxt.c | 4 +- .../accounting_storage/mysql/Makefile.am | 2 +- .../accounting_storage/mysql/Makefile.in | 20 +++---- .../mysql/accounting_storage_mysql.c | 6 +-- ...ost_jobs.c => as_mysql_fix_runaway_jobs.c} | 20 +++---- ...ost_jobs.h => as_mysql_fix_runaway_jobs.h} | 10 ++-- .../none/accounting_storage_none.c | 4 +- .../slurmdbd/accounting_storage_slurmdbd.c | 6 +-- src/sacctmgr/Makefile.am | 2 +- src/sacctmgr/Makefile.in | 6 +-- ...bs_functions.c => runaway_job_functions.c} | 53 ++++++++++--------- src/sacctmgr/sacctmgr.c | 15 ++++-- src/sacctmgr/sacctmgr.h | 4 +- src/slurmdbd/proc_req.c | 24 ++++----- 19 files changed, 109 insertions(+), 98 deletions(-) rename src/plugins/accounting_storage/mysql/{as_mysql_fix_lost_jobs.c => as_mysql_fix_runaway_jobs.c} (91%) rename src/plugins/accounting_storage/mysql/{as_mysql_fix_lost_jobs.h => as_mysql_fix_runaway_jobs.h} (88%) rename src/sacctmgr/{lost_jobs_functions.c => runaway_job_functions.c} (84%) diff --git a/doc/man/man1/sacctmgr.1 b/doc/man/man1/sacctmgr.1 index 2a708a643bd..543ad1ce936 100644 --- a/doc/man/man1/sacctmgr.1 +++ b/doc/man/man1/sacctmgr.1 @@ -208,8 +208,11 @@ account and user associations inside their realm. Events like downed or draining nodes on clusters. .TP -\fILostJobs\fR -Jobs that have been orphanded on the local cluster. +\fIRunawayJobs\fR +Used only with the \fIlist\fR or \fIshow\fR command to report current +jobs that have been orphaned on the local cluster and are now +runaway. If there are jobs in this state it will also give you an +option to "fix" them. 
.TP \fIjob\fR diff --git a/src/common/slurm_accounting_storage.c b/src/common/slurm_accounting_storage.c index 9387862020a..b3e6d25a8e8 100644 --- a/src/common/slurm_accounting_storage.c +++ b/src/common/slurm_accounting_storage.c @@ -170,7 +170,7 @@ typedef struct slurm_acct_storage_ops { int (*roll_usage) (void *db_conn, time_t sent_start, time_t sent_end, uint16_t archive_data); - int (*fix_lost_jobs) (void *db_conn, uint32_t uid, List jobs); + int (*fix_runaway_jobs) (void *db_conn, uint32_t uid, List jobs); int (*node_down) (void *db_conn, struct node_record *node_ptr, time_t event_time, @@ -257,7 +257,7 @@ static const char *syms[] = { "acct_storage_p_get_txn", "acct_storage_p_get_usage", "acct_storage_p_roll_usage", - "acct_storage_p_fix_lost_jobs", + "acct_storage_p_fix_runaway_jobs", "clusteracct_storage_p_node_down", "clusteracct_storage_p_node_up", "clusteracct_storage_p_cluster_tres", @@ -733,11 +733,12 @@ extern int acct_storage_g_roll_usage(void *db_conn, return (*(ops.roll_usage))(db_conn, sent_start, sent_end, archive_data); } -extern int acct_storage_g_fix_lost_jobs(void *db_conn, uint32_t uid, List jobs) +extern int acct_storage_g_fix_runaway_jobs(void *db_conn, + uint32_t uid, List jobs) { if (slurm_acct_storage_init(NULL) < 0) return SLURM_ERROR; - return (*(ops.fix_lost_jobs))(db_conn, uid, jobs); + return (*(ops.fix_runaway_jobs))(db_conn, uid, jobs); } diff --git a/src/common/slurm_accounting_storage.h b/src/common/slurm_accounting_storage.h index f968bca6140..6e6d1390f1d 100644 --- a/src/common/slurm_accounting_storage.h +++ b/src/common/slurm_accounting_storage.h @@ -472,11 +472,12 @@ extern int acct_storage_g_roll_usage(void *db_conn, uint16_t archive_data); /* - * Fix lost jobs - * IN: jobs, a list of all the lost jobs + * Fix runaway jobs + * IN: jobs, a list of all the runaway jobs * RET: SLURM_SUCCESS on success SLURM_ERROR else */ -extern int acct_storage_g_fix_lost_jobs(void *db_conn, uint32_t uid, List jobs); +extern int 
acct_storage_g_fix_runaway_jobs(void *db_conn, + uint32_t uid, List jobs); /* * record shares used information for backup in case slurmctld restarts diff --git a/src/common/slurmdbd_defs.c b/src/common/slurmdbd_defs.c index da48ed0e65d..441e57adcf9 100644 --- a/src/common/slurmdbd_defs.c +++ b/src/common/slurmdbd_defs.c @@ -521,7 +521,7 @@ extern Buf pack_slurmdbd_msg(slurmdbd_msg_t *req, uint16_t rpc_version) case DBD_GOT_MULT_JOB_START: case DBD_SEND_MULT_MSG: case DBD_GOT_MULT_MSG: - case DBD_FIX_LOST_JOB: + case DBD_FIX_RUNAWAY_JOB: slurmdbd_pack_list_msg( (dbd_list_msg_t *)req->data, rpc_version, req->msg_type, buffer); @@ -3671,7 +3671,7 @@ extern void slurmdbd_pack_list_msg(dbd_list_msg_t *msg, my_function = pack_config_key_pair; break; case DBD_GOT_JOBS: - case DBD_FIX_LOST_JOB: + case DBD_FIX_RUNAWAY_JOB: my_function = slurmdb_pack_job_rec; break; case DBD_GOT_LIST: @@ -3773,7 +3773,7 @@ extern int slurmdbd_unpack_list_msg(dbd_list_msg_t **msg, uint16_t rpc_version, my_destroy = destroy_config_key_pair; break; case DBD_GOT_JOBS: - case DBD_FIX_LOST_JOB: + case DBD_FIX_RUNAWAY_JOB: my_function = slurmdb_unpack_job_rec; my_destroy = slurmdb_destroy_job_rec; break; diff --git a/src/common/slurmdbd_defs.h b/src/common/slurmdbd_defs.h index a177d60d849..65fb720f227 100644 --- a/src/common/slurmdbd_defs.h +++ b/src/common/slurmdbd_defs.h @@ -156,7 +156,7 @@ typedef enum { DBD_ADD_TRES, /* Add tres to the database */ DBD_GET_TRES, /* Get tres from the database */ DBD_GOT_TRES, /* Got tres from the database */ - DBD_FIX_LOST_JOB, /* Fix the lost jobs */ + DBD_FIX_RUNAWAY_JOB, /* Fix any runaway jobs */ } slurmdbd_msg_type_t; /*****************************************************************************\ diff --git a/src/plugins/accounting_storage/filetxt/accounting_storage_filetxt.c b/src/plugins/accounting_storage/filetxt/accounting_storage_filetxt.c index 49c9478fabe..d7db21aaaaf 100644 --- a/src/plugins/accounting_storage/filetxt/accounting_storage_filetxt.c 
+++ b/src/plugins/accounting_storage/filetxt/accounting_storage_filetxt.c @@ -547,8 +547,8 @@ extern int acct_storage_p_roll_usage(void *db_conn, return rc; } -extern int acct_storage_p_fix_lost_jobs(void *db_conn, uint32_t uid, - List jobs) +extern int acct_storage_p_fix_runaway_jobs(void *db_conn, uint32_t uid, + List jobs) { return SLURM_SUCCESS; } diff --git a/src/plugins/accounting_storage/mysql/Makefile.am b/src/plugins/accounting_storage/mysql/Makefile.am index 178459e25a1..6ebd26e78eb 100644 --- a/src/plugins/accounting_storage/mysql/Makefile.am +++ b/src/plugins/accounting_storage/mysql/Makefile.am @@ -13,7 +13,7 @@ AS_MYSQL_SOURCES = accounting_storage_mysql.c accounting_storage_mysql.h \ as_mysql_assoc.c as_mysql_assoc.h \ as_mysql_cluster.c as_mysql_cluster.h \ as_mysql_convert.c as_mysql_convert.h \ - as_mysql_fix_lost_jobs.c as_mysql_fix_lost_jobs.h \ + as_mysql_fix_runaway_jobs.c as_mysql_fix_runaway_jobs.h \ as_mysql_job.c as_mysql_job.h \ as_mysql_jobacct_process.c as_mysql_jobacct_process.h \ as_mysql_problems.c as_mysql_problems.h \ diff --git a/src/plugins/accounting_storage/mysql/Makefile.in b/src/plugins/accounting_storage/mysql/Makefile.in index 98427145a35..018dad24784 100644 --- a/src/plugins/accounting_storage/mysql/Makefile.in +++ b/src/plugins/accounting_storage/mysql/Makefile.in @@ -183,7 +183,7 @@ am__accounting_storage_mysql_la_SOURCES_DIST = \ as_mysql_tres.h as_mysql_archive.c as_mysql_archive.h \ as_mysql_assoc.c as_mysql_assoc.h as_mysql_cluster.c \ as_mysql_cluster.h as_mysql_convert.c as_mysql_convert.h \ - as_mysql_fix_lost_jobs.c as_mysql_fix_lost_jobs.h \ + as_mysql_fix_runaway_jobs.c as_mysql_fix_runaway_jobs.h \ as_mysql_job.c as_mysql_job.h as_mysql_jobacct_process.c \ as_mysql_jobacct_process.h as_mysql_problems.c \ as_mysql_problems.h as_mysql_qos.c as_mysql_qos.h \ @@ -200,7 +200,7 @@ am__objects_1 = \ accounting_storage_mysql_la-as_mysql_assoc.lo \ accounting_storage_mysql_la-as_mysql_cluster.lo \ 
accounting_storage_mysql_la-as_mysql_convert.lo \ - accounting_storage_mysql_la-as_mysql_fix_lost_jobs.lo \ + accounting_storage_mysql_la-as_mysql_fix_runaway_jobs.lo \ accounting_storage_mysql_la-as_mysql_job.lo \ accounting_storage_mysql_la-as_mysql_jobacct_process.lo \ accounting_storage_mysql_la-as_mysql_problems.lo \ @@ -220,7 +220,7 @@ am__EXTRA_accounting_storage_mysql_la_SOURCES_DIST = \ as_mysql_tres.h as_mysql_archive.c as_mysql_archive.h \ as_mysql_assoc.c as_mysql_assoc.h as_mysql_cluster.c \ as_mysql_cluster.h as_mysql_convert.c as_mysql_convert.h \ - as_mysql_fix_lost_jobs.c as_mysql_fix_lost_jobs.h \ + as_mysql_fix_runaway_jobs.c as_mysql_fix_runaway_jobs.h \ as_mysql_job.c as_mysql_job.h as_mysql_jobacct_process.c \ as_mysql_jobacct_process.h as_mysql_problems.c \ as_mysql_problems.h as_mysql_qos.c as_mysql_qos.h \ @@ -572,7 +572,7 @@ AS_MYSQL_SOURCES = accounting_storage_mysql.c accounting_storage_mysql.h \ as_mysql_assoc.c as_mysql_assoc.h \ as_mysql_cluster.c as_mysql_cluster.h \ as_mysql_convert.c as_mysql_convert.h \ - as_mysql_fix_lost_jobs.c as_mysql_fix_lost_jobs.h \ + as_mysql_fix_runaway_jobs.c as_mysql_fix_runaway_jobs.h \ as_mysql_job.c as_mysql_job.h \ as_mysql_jobacct_process.c as_mysql_jobacct_process.h \ as_mysql_problems.c as_mysql_problems.h \ @@ -680,7 +680,7 @@ distclean-compile: @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/accounting_storage_mysql_la-as_mysql_assoc.Plo@am__quote@ @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/accounting_storage_mysql_la-as_mysql_cluster.Plo@am__quote@ @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/accounting_storage_mysql_la-as_mysql_convert.Plo@am__quote@ -@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/accounting_storage_mysql_la-as_mysql_fix_lost_jobs.Plo@am__quote@ +@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/accounting_storage_mysql_la-as_mysql_fix_runaway_jobs.Plo@am__quote@ @AMDEP_TRUE@@am__include@ 
@am__quote@./$(DEPDIR)/accounting_storage_mysql_la-as_mysql_job.Plo@am__quote@ @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/accounting_storage_mysql_la-as_mysql_jobacct_process.Plo@am__quote@ @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/accounting_storage_mysql_la-as_mysql_problems.Plo@am__quote@ @@ -764,12 +764,12 @@ accounting_storage_mysql_la-as_mysql_convert.lo: as_mysql_convert.c @AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@ @am__fastdepCC_FALSE@ $(AM_V_CC@am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(accounting_storage_mysql_la_CFLAGS) $(CFLAGS) -c -o accounting_storage_mysql_la-as_mysql_convert.lo `test -f 'as_mysql_convert.c' || echo '$(srcdir)/'`as_mysql_convert.c -accounting_storage_mysql_la-as_mysql_fix_lost_jobs.lo: as_mysql_fix_lost_jobs.c -@am__fastdepCC_TRUE@ $(AM_V_CC)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(accounting_storage_mysql_la_CFLAGS) $(CFLAGS) -MT accounting_storage_mysql_la-as_mysql_fix_lost_jobs.lo -MD -MP -MF $(DEPDIR)/accounting_storage_mysql_la-as_mysql_fix_lost_jobs.Tpo -c -o accounting_storage_mysql_la-as_mysql_fix_lost_jobs.lo `test -f 'as_mysql_fix_lost_jobs.c' || echo '$(srcdir)/'`as_mysql_fix_lost_jobs.c -@am__fastdepCC_TRUE@ $(AM_V_at)$(am__mv) $(DEPDIR)/accounting_storage_mysql_la-as_mysql_fix_lost_jobs.Tpo $(DEPDIR)/accounting_storage_mysql_la-as_mysql_fix_lost_jobs.Plo -@AMDEP_TRUE@@am__fastdepCC_FALSE@ $(AM_V_CC)source='as_mysql_fix_lost_jobs.c' object='accounting_storage_mysql_la-as_mysql_fix_lost_jobs.lo' libtool=yes @AMDEPBACKSLASH@ +accounting_storage_mysql_la-as_mysql_fix_runaway_jobs.lo: as_mysql_fix_runaway_jobs.c +@am__fastdepCC_TRUE@ $(AM_V_CC)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) 
$(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(accounting_storage_mysql_la_CFLAGS) $(CFLAGS) -MT accounting_storage_mysql_la-as_mysql_fix_runaway_jobs.lo -MD -MP -MF $(DEPDIR)/accounting_storage_mysql_la-as_mysql_fix_runaway_jobs.Tpo -c -o accounting_storage_mysql_la-as_mysql_fix_runaway_jobs.lo `test -f 'as_mysql_fix_runaway_jobs.c' || echo '$(srcdir)/'`as_mysql_fix_runaway_jobs.c +@am__fastdepCC_TRUE@ $(AM_V_at)$(am__mv) $(DEPDIR)/accounting_storage_mysql_la-as_mysql_fix_runaway_jobs.Tpo $(DEPDIR)/accounting_storage_mysql_la-as_mysql_fix_runaway_jobs.Plo +@AMDEP_TRUE@@am__fastdepCC_FALSE@ $(AM_V_CC)source='as_mysql_fix_runaway_jobs.c' object='accounting_storage_mysql_la-as_mysql_fix_runaway_jobs.lo' libtool=yes @AMDEPBACKSLASH@ @AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@ -@am__fastdepCC_FALSE@ $(AM_V_CC@am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(accounting_storage_mysql_la_CFLAGS) $(CFLAGS) -c -o accounting_storage_mysql_la-as_mysql_fix_lost_jobs.lo `test -f 'as_mysql_fix_lost_jobs.c' || echo '$(srcdir)/'`as_mysql_fix_lost_jobs.c +@am__fastdepCC_FALSE@ $(AM_V_CC@am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(accounting_storage_mysql_la_CFLAGS) $(CFLAGS) -c -o accounting_storage_mysql_la-as_mysql_fix_runaway_jobs.lo `test -f 'as_mysql_fix_runaway_jobs.c' || echo '$(srcdir)/'`as_mysql_fix_runaway_jobs.c accounting_storage_mysql_la-as_mysql_job.lo: as_mysql_job.c @am__fastdepCC_TRUE@ $(AM_V_CC)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(accounting_storage_mysql_la_CFLAGS) $(CFLAGS) -MT accounting_storage_mysql_la-as_mysql_job.lo -MD -MP -MF 
$(DEPDIR)/accounting_storage_mysql_la-as_mysql_job.Tpo -c -o accounting_storage_mysql_la-as_mysql_job.lo `test -f 'as_mysql_job.c' || echo '$(srcdir)/'`as_mysql_job.c diff --git a/src/plugins/accounting_storage/mysql/accounting_storage_mysql.c b/src/plugins/accounting_storage/mysql/accounting_storage_mysql.c index 4c6774cde92..e039eb98b72 100644 --- a/src/plugins/accounting_storage/mysql/accounting_storage_mysql.c +++ b/src/plugins/accounting_storage/mysql/accounting_storage_mysql.c @@ -51,7 +51,7 @@ #include "as_mysql_assoc.h" #include "as_mysql_cluster.h" #include "as_mysql_convert.h" -#include "as_mysql_fix_lost_jobs.h" +#include "as_mysql_fix_runaway_jobs.h" #include "as_mysql_job.h" #include "as_mysql_jobacct_process.h" #include "as_mysql_problems.h" @@ -2875,10 +2875,10 @@ extern int acct_storage_p_roll_usage(mysql_conn_t *mysql_conn, sent_end, archive_data); } -extern int acct_storage_p_fix_lost_jobs(void *db_conn, uint32_t uid, +extern int acct_storage_p_fix_runaway_jobs(void *db_conn, uint32_t uid, List jobs) { - return as_mysql_fix_lost_jobs(db_conn, uid, jobs); + return as_mysql_fix_runaway_jobs(db_conn, uid, jobs); } extern int clusteracct_storage_p_node_down(mysql_conn_t *mysql_conn, diff --git a/src/plugins/accounting_storage/mysql/as_mysql_fix_lost_jobs.c b/src/plugins/accounting_storage/mysql/as_mysql_fix_runaway_jobs.c similarity index 91% rename from src/plugins/accounting_storage/mysql/as_mysql_fix_lost_jobs.c rename to src/plugins/accounting_storage/mysql/as_mysql_fix_runaway_jobs.c index b07dfc56fab..6e06aed3a31 100644 --- a/src/plugins/accounting_storage/mysql/as_mysql_fix_lost_jobs.c +++ b/src/plugins/accounting_storage/mysql/as_mysql_fix_runaway_jobs.c @@ -1,5 +1,5 @@ /*****************************************************************************\ - * as_mysql_fix_lost_jobs.c - functions dealing with lost jobs. + * as_mysql_fix_runaway_jobs.c - functions dealing with runaway jobs. 
***************************************************************************** * Copyright (C) 2016 SchedMD LLC. * Written by Nathan Yee <nyee32@schedmd.com> @@ -34,7 +34,7 @@ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. \*****************************************************************************/ -#include "as_mysql_fix_lost_jobs.h" +#include "as_mysql_fix_runaway_jobs.h" #include "src/common/list.h" static int _job_sort_by_start_time(void *void1, void * void2) @@ -87,8 +87,8 @@ static int _first_job_roll_up(mysql_conn_t *mysql_conn, time_t first_start) return rc; } -extern int as_mysql_fix_lost_jobs(mysql_conn_t *mysql_conn, uint32_t uid, - List lost_jobs) +extern int as_mysql_fix_runaway_jobs(mysql_conn_t *mysql_conn, uint32_t uid, + List runaway_jobs) { char *query = NULL, *job_ids = NULL; slurmdb_job_rec_t *job = NULL; @@ -96,8 +96,8 @@ extern int as_mysql_fix_lost_jobs(mysql_conn_t *mysql_conn, uint32_t uid, int rc = SLURM_SUCCESS; slurmdb_job_rec_t *first_job; - list_sort(lost_jobs, _job_sort_by_start_time); - first_job = list_peek(lost_jobs); + list_sort(runaway_jobs, _job_sort_by_start_time); + first_job = list_peek(runaway_jobs); if (check_connection(mysql_conn) != SLURM_SUCCESS) return ESLURM_DB_CONNECTION; @@ -110,12 +110,12 @@ extern int as_mysql_fix_lost_jobs(mysql_conn_t *mysql_conn, uint32_t uid, if (!is_user_any_coord(mysql_conn, &user)) { error("Only admins/operators/coordinators " - "can fix lost jobs"); + "can fix runaway jobs"); return ESLURM_ACCESS_DENIED; } } - iter = list_iterator_create(lost_jobs); + iter = list_iterator_create(runaway_jobs); while ((job = list_next(iter))) { xstrfmtcat(job_ids, "%s%d", ((job_ids) ? 
"," : ""), job->jobid); } @@ -132,10 +132,10 @@ extern int as_mysql_fix_lost_jobs(mysql_conn_t *mysql_conn, uint32_t uid, xfree(job_ids); /* Set rollup to the the last day of the previous month of the first - * lost job */ + * runaway job */ rc = _first_job_roll_up(mysql_conn, first_job->start); if (rc != SLURM_SUCCESS) { - error("Failed to fix lost jobs"); + error("Failed to fix runaway jobs"); return SLURM_ERROR; } diff --git a/src/plugins/accounting_storage/mysql/as_mysql_fix_lost_jobs.h b/src/plugins/accounting_storage/mysql/as_mysql_fix_runaway_jobs.h similarity index 88% rename from src/plugins/accounting_storage/mysql/as_mysql_fix_lost_jobs.h rename to src/plugins/accounting_storage/mysql/as_mysql_fix_runaway_jobs.h index fba0103a3b5..43cc665b60a 100644 --- a/src/plugins/accounting_storage/mysql/as_mysql_fix_lost_jobs.h +++ b/src/plugins/accounting_storage/mysql/as_mysql_fix_runaway_jobs.h @@ -1,5 +1,5 @@ /*****************************************************************************\ - * as_mysql_fix_lost_jobs.h - functions dealing with lost jobs. + * as_mysql_fix_runaway_jobs.h - functions dealing with runaway jobs. ***************************************************************************** * Copyright (C) 2016 SchedMD LLC. * Written by Nathan Yee <nyee32@schedmd.com> @@ -34,13 +34,13 @@ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 
\*****************************************************************************/ -#ifndef _HAVE_MYSQL_FIX_LOST_JOBS_H -#define _HAVE_MYSQL_FIX_LOST_JOBS_H +#ifndef _HAVE_MYSQL_FIX_RUNAWAY_JOBS_H +#define _HAVE_MYSQL_FIX_RUNAWAY_JOBS_H #include "accounting_storage_mysql.h" #include "src/common/slurm_time.h" -extern int as_mysql_fix_lost_jobs(mysql_conn_t *mysql_conn, uint32_t uid, - List lost_jobs); +extern int as_mysql_fix_runaway_jobs(mysql_conn_t *mysql_conn, uint32_t uid, + List runaway_jobs); #endif diff --git a/src/plugins/accounting_storage/none/accounting_storage_none.c b/src/plugins/accounting_storage/none/accounting_storage_none.c index 2bd1b7b8cac..b5f3f7952e0 100644 --- a/src/plugins/accounting_storage/none/accounting_storage_none.c +++ b/src/plugins/accounting_storage/none/accounting_storage_none.c @@ -371,8 +371,8 @@ extern int acct_storage_p_roll_usage(void *db_conn, return rc; } -extern int acct_storage_p_fix_lost_jobs(void *db_conn, uint32_t uid, - List jobs) +extern int acct_storage_p_fix_runaway_jobs(void *db_conn, uint32_t uid, + List jobs) { return SLURM_SUCCESS; } diff --git a/src/plugins/accounting_storage/slurmdbd/accounting_storage_slurmdbd.c b/src/plugins/accounting_storage/slurmdbd/accounting_storage_slurmdbd.c index fdad804484f..2d40dec10e3 100644 --- a/src/plugins/accounting_storage/slurmdbd/accounting_storage_slurmdbd.c +++ b/src/plugins/accounting_storage/slurmdbd/accounting_storage_slurmdbd.c @@ -2244,8 +2244,8 @@ extern int acct_storage_p_roll_usage(void *db_conn, return rc; } -extern int acct_storage_p_fix_lost_jobs(void *db_conn, uint32_t uid, - List jobs) +extern int acct_storage_p_fix_runaway_jobs(void *db_conn, uint32_t uid, + List jobs) { slurmdbd_msg_t req; dbd_list_msg_t get_msg; @@ -2254,7 +2254,7 @@ extern int acct_storage_p_fix_lost_jobs(void *db_conn, uint32_t uid, memset(&get_msg, 0, sizeof(dbd_list_msg_t)); get_msg.my_list = jobs; - req.msg_type = DBD_FIX_LOST_JOB; + req.msg_type = DBD_FIX_RUNAWAY_JOB; req.data = 
&get_msg; rc = slurm_send_slurmdbd_recv_rc_msg(SLURM_PROTOCOL_VERSION, diff --git a/src/sacctmgr/Makefile.am b/src/sacctmgr/Makefile.am index 564f28ce333..80754dc36f5 100644 --- a/src/sacctmgr/Makefile.am +++ b/src/sacctmgr/Makefile.am @@ -19,7 +19,7 @@ sacctmgr_SOURCES = \ common.c \ event_functions.c \ file_functions.c \ - lost_jobs_functions.c \ + runaway_job_functions.c \ job_functions.c \ reservation_functions.c \ resource_functions.c \ diff --git a/src/sacctmgr/Makefile.in b/src/sacctmgr/Makefile.in index 7647f4d27a1..04a010e2592 100644 --- a/src/sacctmgr/Makefile.in +++ b/src/sacctmgr/Makefile.in @@ -151,7 +151,7 @@ am_sacctmgr_OBJECTS = account_functions.$(OBJEXT) \ archive_functions.$(OBJEXT) association_functions.$(OBJEXT) \ config_functions.$(OBJEXT) cluster_functions.$(OBJEXT) \ common.$(OBJEXT) event_functions.$(OBJEXT) \ - file_functions.$(OBJEXT) lost_jobs_functions.$(OBJEXT) \ + file_functions.$(OBJEXT) runaway_job_functions.$(OBJEXT) \ job_functions.$(OBJEXT) reservation_functions.$(OBJEXT) \ resource_functions.$(OBJEXT) sacctmgr.$(OBJEXT) \ qos_functions.$(OBJEXT) txn_functions.$(OBJEXT) \ @@ -502,7 +502,7 @@ sacctmgr_SOURCES = \ common.c \ event_functions.c \ file_functions.c \ - lost_jobs_functions.c \ + runaway_job_functions.c\ job_functions.c \ reservation_functions.c \ resource_functions.c \ @@ -617,7 +617,7 @@ distclean-compile: @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/event_functions.Po@am__quote@ @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/file_functions.Po@am__quote@ @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/job_functions.Po@am__quote@ -@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/lost_jobs_functions.Po@am__quote@ +@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/runaway_job_functions.Po@am__quote@ @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/problem_functions.Po@am__quote@ @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/qos_functions.Po@am__quote@ @AMDEP_TRUE@@am__include@ 
@am__quote@./$(DEPDIR)/reservation_functions.Po@am__quote@ diff --git a/src/sacctmgr/lost_jobs_functions.c b/src/sacctmgr/runaway_job_functions.c similarity index 84% rename from src/sacctmgr/lost_jobs_functions.c rename to src/sacctmgr/runaway_job_functions.c index c3ab13da32a..f680077b194 100644 --- a/src/sacctmgr/lost_jobs_functions.c +++ b/src/sacctmgr/runaway_job_functions.c @@ -1,5 +1,5 @@ /*****************************************************************************\ - * lost_jobs_functions.c - functions dealing with lost jobs + * runaway_jobs_functions.c - functions dealing with runaway/orphan jobs ***************************************************************************** * Copyright (C) 2016 SchedMD LLC. * Written by Nathan Yee <nyee32@schedmd.com> @@ -49,7 +49,7 @@ static int _job_sort_by_start_time(void *void1, void * void2) return 0; } -static void _print_lost_jobs(List jobs) +static void _print_runaway_jobs(List jobs) { char outbuf[FORMAT_STRING_SIZE]; slurmdb_job_rec_t *job = NULL; @@ -131,17 +131,17 @@ static void _print_lost_jobs(List jobs) list_iterator_destroy(itr); } -static List _get_lost_jobs(char *cluster) +static List _get_runaway_jobs(char *cluster) { int i = 0; - bool job_lost = true; + bool job_runaway = true; List db_jobs_list = NULL; ListIterator db_jobs_itr = NULL; job_info_t *clus_job = NULL; job_info_msg_t *clus_jobs = NULL; slurmdb_job_rec_t *db_job = NULL; slurmdb_job_cond_t *job_cond = xmalloc(sizeof(slurmdb_job_cond_t)); - List lost_jobs = NULL; + List runaway_jobs = NULL; job_cond->without_steps = 1; job_cond->without_usage_truncation = 1; @@ -158,62 +158,63 @@ static List _get_lost_jobs(char *cluster) return NULL; } - lost_jobs = list_create(NULL); + runaway_jobs = list_create(NULL); db_jobs_itr = list_iterator_create(db_jobs_list); while ((db_job = list_next(db_jobs_itr))) { - job_lost = true; + job_runaway = true; for (i = 0, clus_job = clus_jobs->job_array; i < clus_jobs->record_count; i++, clus_job++) { if (db_job->jobid 
== clus_job->job_id) { - job_lost = false; + job_runaway = false; break; } } - if (job_lost) - list_append(lost_jobs, db_job); + if (job_runaway) + list_append(runaway_jobs, db_job); } list_iterator_destroy(db_jobs_itr); - return lost_jobs; + return runaway_jobs; } -static void _report_lost_jobs(List lost_jobs) +static void _report_runaway_jobs(List runaway_jobs) { - if (list_count(lost_jobs)) { - printf("NOTE: Lost jobs are jobs that don't exist in the " + if (list_count(runaway_jobs)) { + printf("NOTE: Runaway jobs are jobs that don't exist in the " "controller but are still considered running in the " "datbase\n"); - _print_lost_jobs(lost_jobs); + _print_runaway_jobs(runaway_jobs); } } /* - * List and ask user if they wish to fix the lost jobs + * List and ask user if they wish to fix the runaway jobs */ -extern int sacctmgr_list_lost_jobs(int argc, char *argv[]) +extern int sacctmgr_list_runaway_jobs(int argc, char *argv[]) { - List lost_jobs = NULL; + List runaway_jobs = NULL; int rc = SLURM_SUCCESS; uint32_t my_uid = getuid(); char *cluster = NULL; - char *ask_msg = "\nWould you like to fix these lost jobs?\n" + char *ask_msg = "\nWould you like to fix these runaway jobs?\n" "(This will set the end times to start times and " "states to completed for these jobs and will trigger " "the rollup to reroll usage from before the oldest " - "lost job.)\n\n"; + "runaway job.)\n\n"; if (!(cluster = slurm_get_cluster_name())) return SLURM_ERROR; - if (!(lost_jobs = _get_lost_jobs(cluster))) + if (!(runaway_jobs = _get_runaway_jobs(cluster))) return SLURM_ERROR; xfree(cluster); - _report_lost_jobs(lost_jobs); + _report_runaway_jobs(runaway_jobs); - if (list_count(lost_jobs)) { - rc = acct_storage_g_fix_lost_jobs(db_conn, my_uid, lost_jobs); + if (list_count(runaway_jobs)) { + rc = acct_storage_g_fix_runaway_jobs( + db_conn, my_uid, runaway_jobs); if (rc == SLURM_SUCCESS) { if (commit_check(ask_msg)) { @@ -224,12 +225,12 @@ extern int sacctmgr_list_lost_jobs(int argc, 
char *argv[]) } } else { - error("Failed to fix lost job: %s\n", + error("Failed to fix runaway job: %s\n", slurm_strerror(rc)); } } else { - printf("Lost Jobs: No lost jobs found\n"); + printf("Runaway Jobs: No runaway jobs found\n"); } return rc; diff --git a/src/sacctmgr/sacctmgr.c b/src/sacctmgr/sacctmgr.c index dd28ece3424..a8d21564aaf 100644 --- a/src/sacctmgr/sacctmgr.c +++ b/src/sacctmgr/sacctmgr.c @@ -651,8 +651,10 @@ static void _show_it (int argc, char *argv[]) } else if (strncasecmp(argv[0], "Problems", MAX(command_len, 1)) == 0) { error_code = sacctmgr_list_problem((argc - 1), &argv[1]); - } else if (strncasecmp(argv[0], "LostJobs", MAX(command_len, 1)) == 0) { - error_code = sacctmgr_list_lost_jobs((argc - 1), &argv[1]); + } else if (!strncasecmp(argv[0], "RunawayJobs", MAX(command_len, 2)) || + !strncasecmp(argv[0], "OrphanJobs", MAX(command_len, 1)) || + !strncasecmp(argv[0], "LostJobs", MAX(command_len, 1))) { + error_code = sacctmgr_list_runaway_jobs((argc - 1), &argv[1]); } else if (strncasecmp(argv[0], "QOS", MAX(command_len, 1)) == 0) { error_code = sacctmgr_list_qos((argc - 1), &argv[1]); } else if (!strncasecmp(argv[0], "Resource", MAX(command_len, 4))) { @@ -677,7 +679,8 @@ static void _show_it (int argc, char *argv[]) fprintf(stderr, "\"Account\", \"Association\", " "\"Cluster\", \"Configuration\",\n\"Event\", " "\"Problem\", \"QOS\", \"Resource\", \"Reservation\", " - "\"Transaction\", \"TRES\", \"User\", or \"WCKey\"\n"); + "\"RunAwayJobs\", \"Transaction\", \"TRES\", " + "\"User\", or \"WCKey\"\n"); } if (error_code != SLURM_SUCCESS) { @@ -855,8 +858,8 @@ sacctmgr [<OPTION>] [<COMMAND>] \n\ <ENTITY> may be \"account\", \"association\", \"cluster\", \n\ \"configuration\", \"coordinator\", \"event\", \"job\", \n\ \"problem\", \"qos\", \"resource\", \"reservation\", \n\ - \"transaction\", \"tres\", \n\ - \"user\" or \"wckey\" \n\ + \"runawayjobs\", \"transaction\", \"tres\", \n\ + \"user\" or \"wckey\" \n\ \n\ <SPECS> are different for each 
command entity pair. \n\ list account - Clusters=, Descriptions=, Format=, \n\ @@ -944,6 +947,8 @@ sacctmgr [<OPTION>] [<COMMAND>] \n\ \n\ list reservation - Clusters=, End=, ID=, Names=, Nodes=, Start= \n\ \n\ + list runawayjobs \n\ + \n\ list transactions - Accounts=, Action=, Actor=, Clusters=, End=, \n\ Format=, ID=, Start=, User=, and WithAssoc \n\ \n\ diff --git a/src/sacctmgr/sacctmgr.h b/src/sacctmgr/sacctmgr.h index c01da83e733..ab4e2c80bca 100644 --- a/src/sacctmgr/sacctmgr.h +++ b/src/sacctmgr/sacctmgr.h @@ -358,7 +358,7 @@ extern void load_sacctmgr_cfg_file (int argc, char *argv[]); /* txn_functions.c */ extern int sacctmgr_list_txn(int argc, char *argv[]); -/* lost_jobs_functions.c */ -extern int sacctmgr_list_lost_jobs(int argc, char *argv[]); +/* runaway_jobs_functions.c */ +extern int sacctmgr_list_runaway_jobs(int argc, char *argv[]); #endif diff --git a/src/slurmdbd/proc_req.c b/src/slurmdbd/proc_req.c index bfa7d020eed..535c309da31 100644 --- a/src/slurmdbd/proc_req.c +++ b/src/slurmdbd/proc_req.c @@ -180,8 +180,8 @@ static int _step_complete(slurmdbd_conn_t *slurmdbd_conn, Buf in_buffer, Buf *out_buffer, uint32_t *uid); static int _step_start(slurmdbd_conn_t *slurmdbd_conn, Buf in_buffer, Buf *out_buffer, uint32_t *uid); -static int _fix_lost_jobs(slurmdbd_conn_t *slurmdbd_conn, Buf in_buffer, - Buf *out_buffer, uint32_t *uid); +static int _fix_runaway_jobs(slurmdbd_conn_t *slurmdbd_conn, Buf in_buffer, + Buf *out_buffer, uint32_t *uid); /* Process an incoming RPC * slurmdbd_conn IN/OUT - in will that the newsockfd set before @@ -466,9 +466,9 @@ proc_req(slurmdbd_conn_t *slurmdbd_conn, rc = _step_start(slurmdbd_conn, in_buffer, out_buffer, uid); break; - case DBD_FIX_LOST_JOB: - rc = _fix_lost_jobs(slurmdbd_conn, - in_buffer, out_buffer, uid); + case DBD_FIX_RUNAWAY_JOB: + rc = _fix_runaway_jobs(slurmdbd_conn, + in_buffer, out_buffer, uid); break; default: comment = "Invalid RPC"; @@ -569,29 +569,29 @@ end_it: return rc; } -static int 
_fix_lost_jobs(slurmdbd_conn_t *slurmdbd_conn, Buf in_buffer, - Buf *out_buffer, uint32_t *uid) +static int _fix_runaway_jobs(slurmdbd_conn_t *slurmdbd_conn, Buf in_buffer, + Buf *out_buffer, uint32_t *uid) { int rc = SLURM_SUCCESS; dbd_list_msg_t *get_msg = NULL; char *comment = NULL; if (slurmdbd_unpack_list_msg(&get_msg, slurmdbd_conn->rpc_version, - DBD_FIX_LOST_JOB, in_buffer) != + DBD_FIX_RUNAWAY_JOB, in_buffer) != SLURM_SUCCESS) { - comment = "Failed to unpack DBD_LOST_JOBS message"; + comment = "Failed to unpack DBD_RUNAWAY_JOBS message"; error("CONN:%u %s", slurmdbd_conn->newsockfd, comment); rc = SLURM_ERROR; goto end_it; } - rc = acct_storage_g_fix_lost_jobs(slurmdbd_conn->db_conn, *uid, - get_msg->my_list); + rc = acct_storage_g_fix_runaway_jobs(slurmdbd_conn->db_conn, *uid, + get_msg->my_list); end_it: slurmdbd_free_list_msg(get_msg); *out_buffer = make_dbd_rc_msg(slurmdbd_conn->rpc_version, - rc, comment, DBD_FIX_LOST_JOB); + rc, comment, DBD_FIX_RUNAWAY_JOB); return rc; } -- GitLab