From 806d66dc1553e1838c659a82363d3fcea3d5e93e Mon Sep 17 00:00:00 2001
From: Moe Jette <jette1@llnl.gov>
Date: Tue, 30 Jun 2009 18:38:44 +0000
Subject: [PATCH] Modify job step cancel logic for scancel and srun (on receipt
 of SIGTERM     or three SIGINT) to immediately send SIGKILL to spawned tasks.
  Previous     logic would send SIGCONT, SIGTERM, wait KillWait seconds,
 SIGKILL. (Actually this is just a commit of the Totalview test for above
 logic.)

---
 NEWS                            |   3 +
 testsuite/expect/Makefile.am    |   2 +
 testsuite/expect/Makefile.in    |   2 +
 testsuite/expect/README         |   2 +-
 testsuite/expect/test7.5        | 214 ++++++++++++++++++++++++++++++++
 testsuite/expect/test7.5.prog.c |  73 +++++++++++
 6 files changed, 295 insertions(+), 1 deletion(-)
 create mode 100755 testsuite/expect/test7.5
 create mode 100644 testsuite/expect/test7.5.prog.c

diff --git a/NEWS b/NEWS
index 552c34f08d7..1b9e657a18b 100644
--- a/NEWS
+++ b/NEWS
@@ -22,6 +22,9 @@ documents those changes that are of interest to users and admins.
  -- Add node state flag for power-up/configuring. Represented by "#" suffix
     on the node state name (e.g. "ALLOCATED#") for command output.
  -- Add CONFIGURING/CF job state flag for node power-up/configuring.
+ -- Modify job step cancel logic for scancel and srun (on receipt of SIGTERM
+    or three SIGINT) to immediately send SIGKILL to spawned tasks.  Previous 
+    logic would send SIGCONT, SIGTERM, wait KillWait seconds, SIGKILL.
 
 * Changes in SLURM 2.1.0-pre1
 =============================
diff --git a/testsuite/expect/Makefile.am b/testsuite/expect/Makefile.am
index fb162b68081..0cbe65009f9 100644
--- a/testsuite/expect/Makefile.am
+++ b/testsuite/expect/Makefile.am
@@ -161,6 +161,8 @@ EXTRA_DIST = \
 	test7.3.prog.c			\
 	test7.4				\
 	test7.4.prog.c			\
+	test7.5				\
+	test7.5.prog.c			\
 	test7.6				\
 	test7.6.prog.c			\
 	test7.7				\
diff --git a/testsuite/expect/Makefile.in b/testsuite/expect/Makefile.in
index 2b145e94482..3e43d73b653 100644
--- a/testsuite/expect/Makefile.in
+++ b/testsuite/expect/Makefile.in
@@ -405,6 +405,8 @@ EXTRA_DIST = \
 	test7.3.prog.c			\
 	test7.4				\
 	test7.4.prog.c			\
+	test7.5				\
+	test7.5.prog.c			\
 	test7.6				\
 	test7.6.prog.c			\
 	test7.7				\
diff --git a/testsuite/expect/README b/testsuite/expect/README
index 75d74bb379e..e9d6e5f2786 100644
--- a/testsuite/expect/README
+++ b/testsuite/expect/README
@@ -285,7 +285,7 @@ test7.3    Test of slurm_step_launch API with spawn_io=true
            (needed by poe on IBM AIX systems).
 test7.4    Test of TotalView operation with srun, with and without bulk 
            transfer.
-test7.5    REMOVED
+test7.5    Test of TotalView termination logic for srun.
 test7.6    Test of TotalView operation with sattach
 test7.7    Test of sched/wiki2 plugin. This is intended to execute in the 
            place of Moab or Maui and emulate its actions to confirm proper
diff --git a/testsuite/expect/test7.5 b/testsuite/expect/test7.5
new file mode 100755
index 00000000000..4dd4ed2e9f2
--- /dev/null
+++ b/testsuite/expect/test7.5
@@ -0,0 +1,214 @@
+#!/usr/bin/expect
+############################################################################
+# Purpose: Test of TotalView termination logic for srun.
+#
+# Output:  "TEST: #.#" followed by "SUCCESS" if test was successful, OR
+#          "FAILURE: ..." otherwise with an explanation of the failure, OR
+#          anything else indicates a failure mode that must be investigated.
+#
+# Note:    This script generates and then deletes files in the working directory
+#          named test7.5.prog
+############################################################################
+# Copyright (C) 2002-2007 The Regents of the University of California.
+# Copyright (C) 2008-2009 Lawrence Livermore National Security.
+# Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
+# Written by Morris Jette <jette1@llnl.gov>
+# CODE-OCEC-09-009. All rights reserved.
+# 
+# This file is part of SLURM, a resource management program.
+# For details, see <https://computing.llnl.gov/linux/slurm/>.
+# Please also read the included file: DISCLAIMER.
+#  
+# SLURM is free software; you can redistribute it and/or modify it under
+# the terms of the GNU General Public License as published by the Free
+# Software Foundation; either version 2 of the License, or (at your option)
+# any later version.
+# 
+# SLURM is distributed in the hope that it will be useful, but WITHOUT ANY
+# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+# FOR A PARTICULAR PURPOSE.  See the GNU General Public License for more
+# details.
+# 
+# You should have received a copy of the GNU General Public License along
+# with SLURM; if not, write to the Free Software Foundation, Inc.,
+# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301  USA.
+############################################################################
+source ./globals
+
+set test_id     "7.5"
+set exit_code   0
+set file_prog   "test$test_id.prog"
+# matches:  task startup messages seen from srun (exactly one expected per run)
+set matches     0
+# fini_cnt: FINI lines printed by the test program (must stay at zero)
+set fini_cnt    0
+
+print_header $test_id
+
+if { [test_xcpu] } {
+	send_user "\nWARNING: This test is incompatible with XCPU systems\n"
+	exit $exit_code
+}
+
+	
+#
+# Delete left-over program, rebuild it from its .c source and make it owner-only (mode 700)
+#
+exec $bin_rm -f $file_prog
+exec $bin_cc -O ${file_prog}.c -o $file_prog
+exec $bin_chmod 700 $file_prog
+
+#
+# Get uid (NOTE(review): uid does not appear to be used again in this script - confirm)
+#
+spawn $bin_id -u
+expect {
+	-re "($number)" {
+		set uid $expect_out(1,string)
+	}
+	eof {
+		wait
+	}
+}
+
+#
+# Spawn initial program via srun and terminate with SIGTERM
+# Note: For systems supporting proper pthreads, instead use
+#       exec $bin_kill -TERM $srun_pid, otherwise we need pkill
+#       and can get multiple signals delivered
+# Note: We send the signal right after task startup rather than
+#	interspersed with messages because some versions of
+#	Expect have difficulties handling unbuffered srun output
+# Pass criteria: exactly one task startup message and no FINI from the task
+set timeout $max_job_delay
+set srun_pid [spawn $srun -N1 -t1 --debugger-test --unbuffered $file_prog]
+expect {
+	-re "task:.*, host:.*, pid:.*, executable:.*" {
+		incr matches
+		# sleep to make sure the process is actually running
+		exec $bin_sleep 1
+		exec $bin_kill -TERM $srun_pid
+		send_user "\nSent SIGTERM\n"
+		exp_continue
+	}
+	-re "Received signal" {
+		send_user "\nFAILURE: unexpected signal processed\n"
+		set exit_code 1
+		# nothing is sent here, just keep reading the remaining srun output
+		exp_continue
+	}
+	-re "WAITING" {
+		send_user "\nFAILURE: job not stopped in debugger mode\n"
+		set exit_code 1
+		# nothing is sent here, just keep reading the remaining srun output
+		exp_continue
+	}
+	-re "TIME LIMIT" {
+		send_user "\nFAILURE: job not terminated with SIGTERM\n"
+		set exit_code 1
+		exp_continue
+	}
+	-re "error.*not running" {
+		send_user "\nDon't worry about the error...\n"
+		exp_continue
+	}
+	-re "FINI" {
+		incr fini_cnt
+		exp_continue
+	}
+	timeout {
+		send_user "\nFAILURE: srun not responding\n"
+		slow_kill $srun_pid
+		set exit_code 1
+	}
+	eof {
+		send_user "\nEOF\n"
+		wait
+	}
+}
+if {$matches != 1} {
+	send_user "\nFAILURE: srun failed to initialize properly ($matches != 1)\n"
+	set exit_code 1
+}
+if {$fini_cnt > 0} {
+	send_user "\nFAILURE: srun failed to terminate properly ($fini_cnt > 0)\n"
+	set exit_code 1
+}
+
+if {$exit_code == 0} {
+	send_user "\n\nSo far, so good\n\n"
+} else {
+	exit $exit_code
+}
+
+#
+# Spawn initial program via srun and terminate with SIGINT * 3
+# Pass criteria: exactly one task startup message and no FINI from the task
+set matches  0
+set fini_cnt 0
+set srun_pid [spawn $srun -N1 -t1 --debugger-test --unbuffered $file_prog]
+expect {
+	-re "task:.*, host:.*, pid:.*, executable:.*" {
+		incr matches
+		# sleep to make sure the process is actually running
+		exec $bin_sleep 1
+		exec $bin_kill -INT $srun_pid
+		exec $bin_usleep   1000
+		exec $bin_kill -INT $srun_pid
+		exec $bin_usleep   1000
+		exec $bin_kill -INT $srun_pid
+		send_user "\nSent SIGINT * 3\n"
+		exp_continue
+	}
+	-re "Received signal" {
+		send_user "\nFAILURE: unexpected signal processed\n"
+		set exit_code 1
+		# nothing is sent here, just keep reading the remaining srun output
+		exp_continue
+	}
+	-re "WAITING" {
+		send_user "\nFAILURE: job not stopped in debugger mode\n"
+		set exit_code 1
+		# nothing is sent here, just keep reading the remaining srun output
+		exp_continue
+	}
+	-re "TIME LIMIT" {
+		send_user "\nFAILURE: job not terminated with SIGINT\n"
+		set exit_code 1
+		exp_continue
+	}
+	-re "error.*not running" {
+		send_user "\nDon't worry about the error...\n"
+		exp_continue
+	}
+	-re "FINI" {
+		incr fini_cnt
+		exp_continue
+	}
+	timeout {
+		send_user "\nFAILURE: srun not responding\n"
+		slow_kill $srun_pid
+		set exit_code 1
+	}
+	eof {
+		send_user "\nEOF\n"
+		wait
+	}
+}
+if {$matches != 1} {
+	send_user "\nFAILURE: srun failed to initialize properly ($matches != 1)\n"
+	set exit_code 1
+}
+if {$fini_cnt > 0} {
+	send_user "\nFAILURE: srun failed to terminate properly ($fini_cnt > 0)\n"
+	set exit_code 1
+}
+
+#
+# Post-processing: remove the built test program only when every check passed
+#
+if {$exit_code == 0} {
+	exec $bin_rm -f $file_prog
+	send_user "\nSUCCESS\n"
+}
+exit $exit_code
diff --git a/testsuite/expect/test7.5.prog.c b/testsuite/expect/test7.5.prog.c
new file mode 100644
index 00000000000..76e80423a20
--- /dev/null
+++ b/testsuite/expect/test7.5.prog.c
@@ -0,0 +1,73 @@
+/*****************************************************************************\
+ *  test7.5.prog.c - Simple signal catching test program for SLURM regression
+ *  test7.5. Report caught signals. Catch (rather than die on) SIGTERM.
+ *****************************************************************************
+ *  Copyright (C) 2002-2007 The Regents of the University of California.
+ *  Copyright (C) 2008-2009 Lawrence Livermore National Security.
+ *  Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
+ *  Written by Morris Jette <jette1@llnl.gov>
+ *  CODE-OCEC-09-009. All rights reserved.
+ *  
+ *  This file is part of SLURM, a resource management program.
+ *  For details, see <https://computing.llnl.gov/linux/slurm/>.
+ *  Please also read the included file: DISCLAIMER.
+ *  
+ *  SLURM is free software; you can redistribute it and/or modify it under
+ *  the terms of the GNU General Public License as published by the Free
+ *  Software Foundation; either version 2 of the License, or (at your option)
+ *  any later version.
+ *  
+ *  SLURM is distributed in the hope that it will be useful, but WITHOUT ANY
+ *  WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+ *  FOR A PARTICULAR PURPOSE.  See the GNU General Public License for more
+ *  details.
+ *  
+ *  You should have received a copy of the GNU General Public License along
+ *  with SLURM; if not, write to the Free Software Foundation, Inc.,
+ *  51 Franklin Street, Fifth Floor, Boston, MA 02110-1301  USA.
+\*****************************************************************************/
+#include <signal.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <errno.h>
+#include <sys/types.h>
+#include <unistd.h>
+
+
+static volatile sig_atomic_t sigterm_cnt = 0;	/* written by sig_handler(), read by main() */
+/* Report the caught signal on stdout and count SIGTERMs (stdio in a handler: test-only shortcut). */
+void sig_handler(int sig)
+{
+	switch (sig) {
+	case SIGTERM:
+		printf("Received SIGTERM\n");
+		fflush(stdout);
+		sigterm_cnt++;
+		break;
+	default:
+		printf("Received signal %d\n", sig);
+		fflush(stdout);
+		break;
+	}
+}
+
+int main(int argc, char **argv)
+{
+	struct sigaction act;
+	/* Route SIGTERM to sig_handler() so that it does not terminate this program. */
+	act.sa_handler = sig_handler;
+	sigemptyset(&act.sa_mask);
+	act.sa_flags = 0;
+	if (sigaction(SIGTERM, &act, NULL) < 0) {
+		perror("setting SIGTERM handler");
+		exit(2);
+	}
+
+	printf("WAITING\n");
+	fflush(stdout);
+	/* A caught SIGTERM cuts sleep() short, so FINI below is then printed early. */
+	sleep(160);
+
+	printf("FINI: term:%d\n", (int) sigterm_cnt);
+	exit(0);
+}
-- 
GitLab