Newer
Older
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
#!/usr/bin/expect
############################################################################
# Purpose: Test of SLURM functionality
# Test scancel --nodelist option.
#
# Output: "TEST: #.#" followed by "SUCCESS" if test was successful, OR
# "FAILURE: ..." otherwise with an explanation of the failure, OR
# anything else indicates a failure mode that must be investigated.
############################################################################
# Copyright (C) 2008 Lawrence Livermore National Security
# Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
# Written by David Bremer <dbremer@llnl.gov>
# LLNL-CODE-402394.
#
# This file is part of SLURM, a resource management program.
# For details, see <http://www.llnl.gov/linux/slurm/>.
#
# SLURM is free software; you can redistribute it and/or modify it under
# the terms of the GNU General Public License as published by the Free
# Software Foundation; either version 2 of the License, or (at your option)
# any later version.
#
# SLURM is distributed in the hope that it will be useful, but WITHOUT ANY
# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
# FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
# details.
#
# You should have received a copy of the GNU General Public License along
# with SLURM; if not, write to the Free Software Foundation, Inc.,
# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
############################################################################
# Load ./globals first; the command paths and regex fragments used below
# (e.g. $sbatch, $squeue, $scancel, $number) are not defined in this file
# and are expected to come from there.
source ./globals

# Test identity and sizing. num_procs is the number of batch jobs submitted.
set test_id   "6.14"
set file_in   "test${test_id}.input"
set num_procs 10
set exit_code 0

# Job bookkeeping:
#   submitted_jobs - ids of every job this test submitted
#   job_map        - list of {node_name {job_id ...}} entries built from squeue
#   job_list       - jobs expected on the node currently being cancelled
set submitted_jobs {}
set job_map        {}
set job_list       {}
set tmp_job_list   {}
set tmp_map_entry  {}

# Scratch values reused by the loops and expect actions further down.
set job_id    ""
set job_index -1
set found     0
set ii        0

print_header $test_id

#
# Build input script file
#
make_bash_script $file_in "$srun $bin_sleep 600"
#
# Submit some jobs so we have something to work with.
# Every successfully submitted job id is appended to submitted_jobs.
#
for {set ii 0} {$ii < $num_procs} {incr ii} {
    # Reset for each submission: 0 means "no job id reported yet".
    # Without the reset, job_id keeps its initial empty string (or the id
    # from the previous iteration) and the failure check below never fires.
    set job_id 0
    set sbatch_pid [spawn $sbatch --output=/dev/null --error=/dev/null -n1 -N1 $file_in]
    expect {
        -re "Submitted batch job ($number)" {
            set job_id $expect_out(1,string)
            lappend submitted_jobs $job_id
            exp_continue
        }
        timeout {
            send_user "\nFAILURE: sbatch not responding\n"
            slow_kill $sbatch_pid
            set exit_code 1
        }
        eof {
            wait
        }
    }
    if {$job_id == 0} {
        send_user "\nFAILURE: job submit failure\n"
        # Best effort: do not leave the jobs submitted so far sitting in
        # the queue when the test aborts. Errors are ignored on purpose,
        # the test is already failing.
        foreach submitted_id $submitted_jobs {
            catch {exec $scancel $submitted_id}
        }
        exit 1
    }
}
#
# Run squeue and build a map, implemented as a list of list of lists, like so:
# { {node1 {job1 job2 job3}}
# {node2 {job4 job5}}
# }
#
# Only put jobs into the map if they were submitted by this test.
# A squeue that hangs is reported as a failure, and the child process is
# reaped on eof, matching the other expect blocks in this file.
#
set squeue_pid [spawn $squeue -h -t running -u $env(USER) -o "%10i %40N"]
expect {
    -re "($number) *($alpha_numeric) *\r\n" {
        set job_id $expect_out(1,string)
        set node_name $expect_out(2,string)

        #This test doesn't need to track jobs that it didn't submit.
        #-exact avoids the default glob matching of lsearch.
        if { [lsearch -exact $submitted_jobs $job_id] == -1 } {
            exp_continue
        }
        #send_user "job $job_id: node $node_name\n"

        #Insert into a table with node_name as the key, job_id as the value
        set found 0
        for {set ii 0} {$ii < [llength $job_map]} {incr ii} {
            set tmp_map_entry [lindex $job_map $ii]
            #Compare node names as strings; == would compare names that
            #look like numbers by numeric value.
            if { [string equal [lindex $tmp_map_entry 0] $node_name] } {
                set tmp_map_entry [list $node_name [concat [lindex $tmp_map_entry 1] $job_id]]
                set job_map [lreplace $job_map $ii $ii $tmp_map_entry]
                set found 1
                break
            }
        }
        if {$found == 0} {
            #First job seen on this node: start a new map entry.
            lappend job_map [list $node_name [list $job_id]]
        }
        exp_continue
    }
    timeout {
        send_user "\nFAILURE: squeue not responding\n"
        slow_kill $squeue_pid
        set exit_code 1
    }
    eof {
        wait
    }
}
#send_user "job map: $job_map\n"
#
# Walk the node map built above and issue one scancel per node.
# Each job that scancel reports as terminated is struck from that node's
# expected list; anything still on the list once scancel exits is a failure.
#
set ii 0
foreach map_entry $job_map {
    set node_name [lindex $map_entry 0]
    set job_list  [lindex $map_entry 1]

    # The first node is cancelled with --nodelist, every later node with -w,
    # so both spellings of the option are exercised.
    if {$ii == 0} {
        set nodelist_opt "--nodelist"
    } else {
        set nodelist_opt "-w"
    }
    spawn $scancel -v -u $env(USER) $nodelist_opt $node_name

    expect {
        -re "scancel: Terminating job ($number)" {
            # A job we did not map may show up here: one of our pending
            # submissions can start running while earlier nodes are being
            # cancelled. That is fine, unknown ids are simply ignored and
            # cancelling continues node by node until our jobs are gone.
            set job_id $expect_out(1,string)

            # Strike the job from this node's expected list ...
            set job_index [lsearch $job_list $job_id]
            if {$job_index >= 0} {
                set job_list [lreplace $job_list $job_index $job_index]
            }

            # ... and from the global list of jobs still to be cleaned up.
            set job_index [lsearch $submitted_jobs $job_id]
            if {$job_index >= 0} {
                set submitted_jobs [lreplace $submitted_jobs $job_index $job_index]
            }
            exp_continue
        }
        timeout {
            send_user "\nFAILURE: scancel not responding while cancelling for node $node_name\n"
            set exit_code 1
        }
        eof {
            wait
        }
    }

    # Every job squeue listed on this node must have been reported.
    if {[llength $job_list] != 0} {
        send_user "\nFAILURE: scancel did not remove jobs $job_list from node $node_name\n"
        set exit_code 1
    }
    incr ii
}
#
# Whatever is still in submitted_jobs was never mapped to a node (or was
# not reported by the per-node scancel calls above), so cancel those jobs
# one at a time by job id.
#
set leftover_count [llength $submitted_jobs]
for {set leftover_idx 0} {$leftover_idx < $leftover_count} {incr leftover_idx} {
    set job_id [lindex $submitted_jobs $leftover_idx]
    spawn $scancel $job_id
    expect {
        timeout {
            send_user "\nFAILURE: scancel not responding while cancelling job $job_id\n"
            set exit_code 1
        }
        eof {
            wait
        }
    }
}
#
# Report the result. On success also remove the generated batch script so
# the test does not leave files behind; on failure it is kept for debugging.
#
if {$exit_code == 0} {
    file delete -force $file_in
    send_user "\nSUCCESS\n"
}
exit $exit_code