From a63e879e60a1e4b7a7f371c1710cfafb6369baa1 Mon Sep 17 00:00:00 2001 From: Moe Jette <jette1@llnl.gov> Date: Tue, 21 Jan 2003 23:07:48 +0000 Subject: [PATCH] scancel now defined to accept signal number argument. slurm_cancel_job RPC now takes signal argument. --- doc/man/man3/slurm_cancel_job.3 | 27 ++++++++------ src/api/cancel.c | 21 ++++++----- src/api/slurm.h | 13 ++++--- src/common/slurm_protocol_defs.c | 5 +++ src/common/slurm_protocol_defs.h | 8 ++++ src/common/slurm_protocol_pack.c | 53 +++++++++++++++++++++++++- src/scancel/opt.c | 64 +++++++++++++++++++++++++++++++- src/scancel/scancel.c | 41 +++++++++++--------- src/scancel/scancel.h | 5 ++- 9 files changed, 188 insertions(+), 49 deletions(-) diff --git a/doc/man/man3/slurm_cancel_job.3 b/doc/man/man3/slurm_cancel_job.3 index 641bb7b4df8..33cc5a8e506 100644 --- a/doc/man/man3/slurm_cancel_job.3 +++ b/doc/man/man3/slurm_cancel_job.3 @@ -1,21 +1,25 @@ -.TH "Slurm API" "3" "October 2002" "Morris Jette" "Slurm job cancel calls" +.TH "Slurm API" "3" "January 2003" "Morris Jette" "Slurm job cancel calls" .SH "NAME" -slurm_cancel_job, slurm_cancel_job_step \- Slurm job cancel calls +slurm_kill_job, slurm_kill_job_step \- Slurm job signal calls .SH "SYNTAX" .LP #include <slurm.h> .LP -int \fBslurm_cancel_job\fR ( +int \fBslurm_kill_job\fR ( .br - uint32_t \fIjob_id\fP + uint32_t \fIjob_id\fP, +.br + uint16_t \fIsignal\fP .br ); .LP -int \fBslurm_cancel_job_step\fR ( +int \fBslurm_kill_job_step\fR ( .br uint32_t \fIjob_id\fP, .br - uint32_t \fIjob_step_id\fP + uint32_t \fIjob_step_id\fP, +.br + uint16_t \fIsignal\fP .br ); .SH "ARGUMENTS" @@ -26,17 +30,18 @@ Slurm job id number. .TP \fIjob_step_id\fp Slurm job step id number. +.TP +\fIsignal\fp +Signal to be sent to the job or job step. .SH "DESCRIPTION" .LP -\fBslurm_cancel_job\fR Request the cancellation of a running or pending job. This function +\fBslurm_kill_job\fR Request that a signal be sent to a job and all of its job steps. 
If the job is pending, it will be terminated immediately. This function may only be successfully executed by the job's owner or user root. .LP -\fBslurm_cancel_job_step\fR Request the cancellation of a running job step. This function -may only be successfully executed by the job's owner or user root. +\fBslurm_kill_job_step\fR Request that a signal be sent to a specific job step. This function may only be successfully executed by the job's owner or user root. .SH "RETURN VALUE" .LP -On success, zero is returned. On error, -1 is returned, and Slurm error code is set -appropriately. +On success, zero is returned. On error, -1 is returned, and Slurm error code is set appropriately. .SH "ERRORS" .LP \fBSLURM_PROTOCOL_VERSION_ERROR\fR Protocol version has changed, re-link your code. diff --git a/src/api/cancel.c b/src/api/cancel.c index 1c4da241716..e468a9be5f3 100644 --- a/src/api/cancel.c +++ b/src/api/cancel.c @@ -36,31 +36,33 @@ #include "src/common/slurm_protocol_api.h" /* - * slurm_cancel_job - cancel an existing job and all of its steps + * slurm_kill_job - send the specified signal to all steps of an existing job * IN job_id - the job's id + * IN signal - signal number * RET 0 on success or slurm error code */ int -slurm_cancel_job ( uint32_t job_id ) +slurm_kill_job ( uint32_t job_id, uint16_t signal ) { - return slurm_cancel_job_step ( job_id, NO_VAL); + return slurm_kill_job_step ( job_id, NO_VAL, signal ); } /* - * slurm_cancel_job_step - cancel a specific job step + * slurm_kill_job_step - send the specified signal to an existing job step * IN job_id - the job's id * IN step_id - the job step's id + * IN signal - signal number * RET 0 on success or slurm error code */ int -slurm_cancel_job_step ( uint32_t job_id, uint32_t step_id ) +slurm_kill_job_step ( uint32_t job_id, uint32_t step_id, uint16_t signal ) { int msg_size ; int rc ; slurm_fd sockfd ; slurm_msg_t request_msg ; slurm_msg_t response_msg ; - job_step_id_msg_t job_step_id_msg ; + 
job_step_kill_msg_t job_step_kill_msg ; return_code_msg_t * slurm_rc_msg ; /* init message connection for message communication with controller */ @@ -71,10 +73,11 @@ slurm_cancel_job_step ( uint32_t job_id, uint32_t step_id ) } /* send request message */ - job_step_id_msg . job_id = job_id ; - job_step_id_msg . job_step_id = step_id ; + job_step_kill_msg . job_id = job_id ; + job_step_kill_msg . job_step_id = step_id ; + job_step_kill_msg . signal = signal ; request_msg . msg_type = REQUEST_CANCEL_JOB_STEP ; - request_msg . data = &job_step_id_msg ; + request_msg . data = &job_step_kill_msg ; if ( ( rc = slurm_send_controller_msg ( sockfd , & request_msg ) ) == SLURM_SOCKET_ERROR ) { slurm_seterrno ( SLURM_COMMUNICATIONS_SEND_ERROR ); diff --git a/src/api/slurm.h b/src/api/slurm.h index 31f87fc9e58..121fbfac1a7 100644 --- a/src/api/slurm.h +++ b/src/api/slurm.h @@ -487,23 +487,26 @@ extern int slurm_job_will_run (job_desc_msg_t * job_desc_msg , /*****************************************************************************\ - * JOB/STEP CANCELATION FUNCTIONS + * JOB/STEP SIGNALING FUNCTIONS \*****************************************************************************/ /* - * slurm_cancel_job - cancel an existing job and all of its steps + * slurm_kill_job - send the specified signal to all steps of an existing job * IN job_id - the job's id + * IN signal - signal number * RET 0 on success or slurm error code */ -extern int slurm_cancel_job (uint32_t job_id); +extern int slurm_kill_job (uint32_t job_id, uint16_t signal); /* - * slurm_cancel_job_step - cancel a specific job step + * slurm_kill_job_step - send the specified signal to an existing job step * IN job_id - the job's id * IN step_id - the job step's id + * IN signal - signal number * RET 0 on success or slurm error code */ -extern int slurm_cancel_job_step (uint32_t job_id, uint32_t step_id); +extern int slurm_kill_job_step (uint32_t job_id, uint32_t step_id, + uint16_t signal); 
/*****************************************************************************\ diff --git a/src/common/slurm_protocol_defs.c b/src/common/slurm_protocol_defs.c index 5b5fe340270..9788ec80aff 100644 --- a/src/common/slurm_protocol_defs.c +++ b/src/common/slurm_protocol_defs.c @@ -85,6 +85,11 @@ void slurm_free_batch_resp_msg(batch_launch_response_msg_t * msg) FREE_IF_SET(msg); } +void slurm_free_job_step_kill_msg(job_step_kill_msg_t * msg) +{ + FREE_IF_SET(msg); +} + void slurm_free_job_desc_msg(job_desc_msg_t * msg) { int i; diff --git a/src/common/slurm_protocol_defs.h b/src/common/slurm_protocol_defs.h index e787c5858a1..fe4fccd6535 100644 --- a/src/common/slurm_protocol_defs.h +++ b/src/common/slurm_protocol_defs.h @@ -194,6 +194,12 @@ typedef struct job_step_id { uint32_t job_step_id; } job_step_id_t; +typedef struct job_step_kill_msg { + uint32_t job_id; + uint32_t job_step_id; + uint16_t signal; +} job_step_kill_msg_t; + typedef struct job_id_msg { uint32_t job_id; } job_id_msg_t; @@ -412,6 +418,8 @@ void inline slurm_free_update_job_time_msg(job_time_msg_t * msg); void inline slurm_free_batch_resp_msg(batch_launch_response_msg_t * msg); +void inline slurm_free_job_step_kill_msg(job_step_kill_msg_t * msg); + extern char *job_dist_string(uint16_t inx); extern char *job_state_string(enum job_states inx); extern char *job_state_string_compact(enum job_states inx); diff --git a/src/common/slurm_protocol_pack.c b/src/common/slurm_protocol_pack.c index 8bdfed08024..35dc052d012 100644 --- a/src/common/slurm_protocol_pack.c +++ b/src/common/slurm_protocol_pack.c @@ -203,6 +203,10 @@ static void _pack_batch_job_resp_msg(batch_launch_response_msg_t * msg, static int _unpack_batch_job_resp_msg(batch_launch_response_msg_t ** msg, Buf buffer); +static void _pack_job_step_kill_msg(job_step_kill_msg_t * msg, Buf buffer); +static int _unpack_job_step_kill_msg(job_step_kill_msg_t ** msg_ptr, + Buf buffer); + static void _pack_buffer_msg(slurm_msg_t * msg, Buf buffer); /* 
pack_header @@ -411,9 +415,12 @@ pack_msg(slurm_msg_t const *msg, Buf buffer) break; /******** job_step_id_t Messages ********/ case REQUEST_JOB_INFO: - case REQUEST_CANCEL_JOB_STEP: _pack_job_step_id_msg((job_step_id_t *) msg->data, buffer); break; + case REQUEST_CANCEL_JOB_STEP: + _pack_job_step_kill_msg((job_step_kill_msg_t *) + msg->data, buffer); + break; case REQUEST_COMPLETE_JOB_STEP: _pack_complete_job_step_msg((complete_job_step_msg_t *) msg->data, buffer); @@ -619,10 +626,13 @@ unpack_msg(slurm_msg_t * msg, Buf buffer) break; /******** job_step_id_t Messages ********/ case REQUEST_JOB_INFO: - case REQUEST_CANCEL_JOB_STEP: rc = _unpack_job_step_id_msg((job_step_id_t **) & (msg->data), buffer); break; + case REQUEST_CANCEL_JOB_STEP: + rc = _unpack_job_step_kill_msg((job_step_kill_msg_t **) + & (msg->data), buffer); + break; case REQUEST_COMPLETE_JOB_STEP: rc = _unpack_complete_job_step_msg((complete_job_step_msg_t **) & (msg->data), @@ -2212,6 +2222,45 @@ _unpack_job_step_id_msg(job_step_id_t ** msg_ptr, Buf buffer) return SLURM_ERROR; } +/* _pack_job_step_kill_msg + * packs a slurm job step signal message + * IN msg - pointer to the job step signal message + * IN/OUT buffer - destination of the pack, contains pointers that are + * automatically updated + */ +static void +_pack_job_step_kill_msg(job_step_kill_msg_t * msg, Buf buffer) +{ + pack32(msg->job_id, buffer); + pack32(msg->job_step_id, buffer); + pack16(msg->signal, buffer); +} + +/* _unpack_job_step_kill_msg + * unpacks a slurm job step signal message + * OUT msg_ptr - pointer to the job step signal message buffer + * IN/OUT buffer - source of the unpack, contains pointers that are + * automatically updated + */ +static int +_unpack_job_step_kill_msg(job_step_kill_msg_t ** msg_ptr, Buf buffer) +{ + job_step_kill_msg_t *msg; + + msg = xmalloc(sizeof(job_step_kill_msg_t)); + *msg_ptr = msg; + + safe_unpack32(&msg->job_id, buffer); + safe_unpack32(&msg->job_step_id, buffer); + 
safe_unpack16(&msg->signal, buffer); + return SLURM_SUCCESS; + + unpack_error: + FREE_NULL(msg); + *msg_ptr = NULL; + return SLURM_ERROR; +} + static void _pack_complete_job_step_msg(complete_job_step_msg_t * msg, Buf buffer) { diff --git a/src/scancel/opt.c b/src/scancel/opt.c index 8dbd8d6beec..f2fdc94fb81 100644 --- a/src/scancel/opt.c +++ b/src/scancel/opt.c @@ -31,6 +31,7 @@ #endif #include <pwd.h> +#include <signal.h> #include <stdlib.h> #include <string.h> /* strcpy, strncasecmp */ #include <sys/types.h> @@ -48,6 +49,8 @@ #define __DEBUG 0 +#define SIZE(a) (sizeof(a)/sizeof(a[0])) + /*---[ popt definitions ]------------------------------------------------*/ /* generic OPT_ definitions -- mainly for use with env vars @@ -65,6 +68,7 @@ #define OPT_USER 0x07 #define OPT_VERBOSE 0x08 #define OPT_VERSION 0x09 +#define OPT_SIGNAL 0x0a #ifndef POPT_TABLEEND @@ -78,7 +82,9 @@ struct poptOption options[] = { "name of job", "name"}, {"partition", 'p', POPT_ARG_STRING, NULL, OPT_PARTITION, "name of job's partition", "name"}, - {"state", 's', POPT_ARG_STRING, NULL, OPT_STATE, + {"signal", 's', POPT_ARG_STRING, NULL, OPT_SIGNAL, + "signal name or number", "name | integer"}, + {"state", 't', POPT_ARG_STRING, NULL, OPT_STATE, "name of job's state", "PENDING | RUNNING"}, {"user", 'u', POPT_ARG_STRING, NULL, OPT_USER, "name of job's owner", "name"}, @@ -90,6 +96,22 @@ struct poptOption options[] = { POPT_TABLEEND }; +struct signv { + char *name; + uint16_t val; +} sys_signame[ ] = { + { "HUP", SIGHUP }, + { "INT", SIGINT }, + { "QUIT", SIGQUIT }, + { "KILL", SIGKILL }, + { "ALRM", SIGALRM }, + { "TERM", SIGTERM }, + { "USR1", SIGUSR1 }, + { "USR2", SIGUSR2 }, + { "STOP", SIGSTOP }, + { "CONT", SIGCONT } +}; + /*---[ end popt definitions ]---------------------------------------------*/ /* forward declarations of static functions @@ -119,6 +141,10 @@ static void print_version (void); */ static enum job_states xlate_state_name(const char *state_name); +/* translate name name to 
number + */ +static uint16_t xlate_signal_name(const char *signal_name); + /* list known options and their settings */ #if __DEBUG @@ -165,12 +191,40 @@ static enum job_states xlate_state_name(const char *state_name) xstrcat(state_names, ","); xstrcat(state_names, job_state_string(i)); } - fprintf (stderr, "Valid job states include: %s", state_names); + fprintf (stderr, "Valid job states include: %s\n", state_names); xfree (state_names); exit (1); } +static uint16_t xlate_signal_name(const char *signal_name) +{ + uint16_t sig_num; + char *end_ptr, *sig_names; + int i; + + sig_num = (uint16_t) strtol(signal_name, &end_ptr, 10); + if ((*end_ptr == '\0') || (sig_num != 0)) + return sig_num; + + for (i=0; i<SIZE(sys_signame); i++) { + if (strcasecmp(sys_signame[i].name, signal_name) == 0) { + xfree(sig_names); + return sys_signame[i].val; + } + if (i == 0) + sig_names = xstrdup(sys_signame[i].name); + else { + xstrcat(sig_names, ","); + xstrcat(sig_names, sys_signame[i].name); + } + } + fprintf (stderr, "Invalid job signal: %s\n", signal_name); + fprintf (stderr, "Valid signals include: %s\n", sig_names); + xfree(sig_names); + exit(1); +} + static void print_version (void) { printf("%s %s\n", PACKAGE, VERSION); @@ -185,6 +239,7 @@ static void opt_default() opt.job_cnt = 0; opt.job_name = NULL; opt.partition = NULL; + opt.signal = SIGKILL; opt.state = JOB_END; opt.user_name = NULL; opt.user_id = 0; @@ -279,6 +334,10 @@ static void opt_args(int ac, char **av) opt.partition = xstrdup(arg); break; + case OPT_SIGNAL: + opt.signal = xlate_signal_name(arg); + break; + case OPT_STATE: opt.state = xlate_state_name(arg); break; @@ -415,6 +474,7 @@ opt_list(void) info("interactive : %s", tf_(opt.interactive)); info("job_name : %s", opt.job_name); info("partition : %s", opt.partition); + info("signal : %u", opt.signal); info("state : %s", job_state_string(opt.state)); info("user_id : %u", opt.user_id); info("user_name : %s", opt.user_name); diff --git a/src/scancel/scancel.c 
b/src/scancel/scancel.c index 0e578b19e10..dce44e5d13e 100644 --- a/src/scancel/scancel.c +++ b/src/scancel/scancel.c @@ -25,7 +25,7 @@ \*****************************************************************************/ #if HAVE_CONFIG_H -# include <config.h> +# include "config.h" #endif #include <unistd.h> @@ -51,8 +51,9 @@ #define MAX_CANCEL_RETRY 10 static void cancel_jobs (void); -static void cancel_job_id (uint32_t job_id); -static void cancel_step_id (uint32_t job_id, uint32_t step_id); +static void _cancel_job_id (uint32_t job_id, uint16_t signal); +static void _cancel_step_id (uint32_t job_id, uint32_t step_id, + uint16_t signal); static int confirmation (int i); static void filter_job_records (void); static void load_job_records (void); @@ -173,10 +174,12 @@ cancel_jobs (void) if (opt.interactive && (confirmation(i) == 0)) break; if (opt.step_id[j] == NO_VAL) - cancel_job_id (opt.job_id[j]); + _cancel_job_id (opt.job_id[j], + opt.signal); else - cancel_step_id (opt.job_id[j], - opt.step_id[j]); + _cancel_step_id (opt.job_id[j], + opt.step_id[j], + opt.signal); break; } if (i >= job_buffer_ptr->record_count) @@ -187,10 +190,12 @@ cancel_jobs (void) } else if (opt.job_cnt) { /* delete specific jobs */ for (j = 0; j < opt.job_cnt; j++ ) { if (opt.step_id[j] == NO_VAL) - cancel_job_id (opt.job_id[j]); + _cancel_job_id (opt.job_id[j], + opt.signal); else - cancel_step_id (opt.job_id[j], - opt.step_id[j]); + _cancel_step_id (opt.job_id[j], + opt.step_id[j], + opt.signal); } } else { /* delete all jobs per filtering */ @@ -200,19 +205,19 @@ cancel_jobs (void) continue; if (opt.interactive && (confirmation(i) == 0)) continue; - cancel_job_id (job_ptr[i].job_id); + _cancel_job_id (job_ptr[i].job_id, opt.signal); } } } static void -cancel_job_id (uint32_t job_id) +_cancel_job_id (uint32_t job_id, uint16_t signal) { int error_code, i; for (i=0; i<MAX_CANCEL_RETRY; i++) { - verbose("cancelling job %u", job_id); - error_code = slurm_cancel_job (job_id); + verbose("Killing 
job %u", job_id); + error_code = slurm_kill_job (job_id, signal); if ((error_code == 0) || (errno != ESLURM_TRANSITION_STATE_NO_UPDATE)) break; @@ -220,19 +225,19 @@ cancel_job_id (uint32_t job_id) sleep ( 5 + i ); } if (error_code) { - fprintf (stderr, "Cancel job error on job id %u: %s\n", + fprintf (stderr, "Kill job error on job id %u: %s\n", job_id, slurm_strerror(slurm_get_errno())); } } static void -cancel_step_id (uint32_t job_id, uint32_t step_id) +_cancel_step_id (uint32_t job_id, uint32_t step_id, uint16_t signal) { int error_code, i; for (i=0; i<MAX_CANCEL_RETRY; i++) { - verbose("cancelling steo %u.%u", job_id, step_id); - error_code = slurm_cancel_job_step (job_id, step_id); + verbose("Killing step %u.%u", job_id, step_id); + error_code = slurm_kill_job_step (job_id, step_id, signal); if ((error_code == 0) || (errno != ESLURM_TRANSITION_STATE_NO_UPDATE)) break; @@ -240,7 +245,7 @@ cancel_step_id (uint32_t job_id, uint32_t step_id) sleep ( 5 + i ); } if (error_code) { - fprintf (stderr, "Cancel job error on job id %u.%u: %s\n", + fprintf (stderr, "Kill job error on job id %u.%u: %s\n", job_id, step_id, slurm_strerror(slurm_get_errno())); } } diff --git a/src/scancel/scancel.h b/src/scancel/scancel.h index c94a16e05c5..81027270c7d 100644 --- a/src/scancel/scancel.h +++ b/src/scancel/scancel.h @@ -28,7 +28,7 @@ #define _HAVE_SCANCEL_H #if HAVE_CONFIG_H -#include <config.h> +#include "config.h" #endif /* @@ -53,7 +53,8 @@ typedef struct scancel_options { bool interactive; /* --interactive, -i */ char *job_name; /* --name=n, -nn */ char *partition; /* --partition=n, -pn */ - enum job_states state; /* --state=n, -sn */ + uint16_t signal; /* --signal=n, -sn */ + enum job_states state; /* --state=n, -tn */ uid_t user_id; /* --user=n, -un */ char *user_name; /* --user=n, -un */ int verbose; /* --verbose, -v */ -- GitLab