diff --git a/NEWS b/NEWS index 05bf09172789658094063b7304a71d26595594f7..a44959fd16c151bfb16f6b9f762c6e76100c6c82 100644 --- a/NEWS +++ b/NEWS @@ -23,6 +23,9 @@ documents those changes that are of interest to users and admins. message if window too small (Dan Palermo, HP, patch.1.0.0.1.060126.smap). -- Sacct mods for inconsistent records (race condition) and replace --debug option with --verbose (Andy Riebs, HP, slurm.hp.sacct_exp_vvv.patch). + -- scancel of a job step will now send a job-step-completed message + to the controller after verifying that the step has completed on all nodes. + -- Fix task layout bug in srun. * Changes in SLURM 1.0.1 ======================== @@ -155,6 +158,8 @@ documents those changes that are of interest to users and admins. ========================= -- Fix bug in sinfo partition sorting order. -- Fix bugs in srun use of #SLURM options in batch script. + -- Use full Elan credential space rather than re-using credentials as soon + as job step completes (helps with fault-tolerance). * Changes in SLURM 0.6.10 ========================= diff --git a/slurm/slurm.h.in b/slurm/slurm.h.in index b672decbdce967eb355c3fd46ed98333e81d9b45..804034dc5ef437745ece24dbb0729b6a7c007483 100644 --- a/slurm/slurm.h.in +++ b/slurm/slurm.h.in @@ -833,6 +833,29 @@ extern int slurm_complete_job_step PARAMS(( uint32_t job_id, uint32_t step_id, uint32_t job_return_code, uint32_t system_return_code)); +/* + * slurm_terminate_job - terminates all steps of an existing job by sending + * a REQUEST_TERMINATE_JOB rpc to all slurmd in the the job allocation, + * and then calls slurm_complete_job(). + * IN job_id - the job's id + * RET 0 on success or slurm error code + */ +extern int +slurm_terminate_job PARAMS((uint32_t job_id)); + +/* + * slurm_terminate_job_step - terminates a job step by sending a + * REQUEST_TERMINATE_TASKS rpc to all slurmd of a job step, and then + * calls slurm_complete_job_step() after verifying that all + * nodes in the job step no longer have running tasks from the job + * step. (May take over 35 seconds to return.) + * IN job_id - the job's id + * IN step_id - the job step's id - use SLURM_BATCH_SCRIPT as the step_id + * to terminate a job's batch script + * RET 0 on success or slurm error code + */ +extern int slurm_terminate_job_step PARAMS(( + uint32_t job_id, uint32_t step_id)); /*****************************************************************************\ * SLURM TASK SPAWNING FUNCTIONS diff --git a/src/api/complete.c b/src/api/complete.c index 19b34f82bb9685e25a21471c8c965d629cea2263..a95dceb339479ab75b07093f3da59bc25c3d4f22 100644 --- a/src/api/complete.c +++ b/src/api/complete.c @@ -49,8 +49,8 @@ int slurm_complete_job ( uint32_t job_id, uint32_t job_return_code, uint32_t system_return_code ) { - return slurm_complete_job_step ( job_id, NO_VAL, job_return_code, - system_return_code); + return slurm_complete_job_step ( job_id, SLURM_BATCH_SCRIPT, + job_return_code, system_return_code); } /* diff --git a/src/api/signal.c b/src/api/signal.c index 91b7bd22d0b13e6220913c31c7b9d251d76cedc6..7aea4f50013d3969f0a9eca8892c7db71c7a302a 100644 --- a/src/api/signal.c +++ b/src/api/signal.c @@ -49,6 +49,13 @@ static int _signal_job_step(const job_step_info_t *step, uint16_t signal); static int _signal_batch_script_step( const resource_allocation_response_msg_t *allocation, uint16_t signal); +static int _terminate_job_step(const job_step_info_t *step, + const resource_allocation_response_msg_t *allocation); +static int _job_step_wait(uint32_t jobid, uint32_t stepid, + const slurm_addr addresses[], int num_nodes, + int wait_time); +static int _terminate_batch_script_step( + const resource_allocation_response_msg_t *allocation); static int _p_send_recv_rc_msg(int num_nodes, slurm_msg_t msg[], int rc[], int timeout); static void *_thr_send_recv_rc_msg(void *args); @@ -67,7 +74,7 @@ struct send_recv_rc { * IN signal - signal number * RET 0 on success or slurm error code */ -int +extern int slurm_signal_job (uint32_t job_id, uint16_t signal) { int rc = SLURM_SUCCESS; @@ -123,7 +130,7 @@ fail1: * IN signal - signal number * RET 0 on success or slurm error code */ -int +extern int slurm_signal_job_step (uint32_t job_id, uint32_t step_id, uint16_t signal) { resource_allocation_response_msg_t *alloc_info; @@ -177,7 +184,11 @@ fail1: } } -void +/* + * Retrieve the host address from the "allocation" structure for each + * node in the specified "step". + */ +static void _get_step_addresses(const job_step_info_t *step, const resource_allocation_response_msg_t *allocation, slurm_addr **address, int *num_addresses) @@ -357,3 +368,268 @@ _thr_send_recv_rc_msg(void *args) pthread_cond_signal(cond); slurm_mutex_unlock(lock); } + +/* + * slurm_terminate_job - terminates all steps of an existing job by sending + * a REQUEST_TERMINATE_JOB rpc to all slurmd in the the job allocation, + * and then calls slurm_complete_job(). + * IN job_id - the job's id + * RET 0 on success or slurm error code + */ +extern int +slurm_terminate_job (uint32_t job_id) +{ + int rc = SLURM_SUCCESS; + resource_allocation_response_msg_t *alloc_info; + slurm_msg_t *msg; /* array of message structs, one per node */ + signal_job_msg_t rpc; + int *rc_array; + int i; + + if (slurm_allocation_lookup(job_id, &alloc_info)) { + rc = slurm_get_errno(); + goto fail1; + } + + /* same remote procedure call for each node */ + rpc.job_id = job_id; + rpc.signal = (uint32_t)-1; /* not used by slurmd */ + + msg = xmalloc(sizeof(slurm_msg_t) * alloc_info->node_cnt); + rc_array = xmalloc(sizeof(int) * alloc_info->node_cnt); + for (i = 0; i < alloc_info->node_cnt; i++) { + msg[i].msg_type = REQUEST_TERMINATE_JOB; + msg[i].data = &rpc; + msg[i].address = alloc_info->node_addr[i]; + } + + _p_send_recv_rc_msg(alloc_info->node_cnt, msg, rc_array, 10); + + for (i = 0; i < alloc_info->node_cnt; i++) { + if (rc_array[i]) { + rc = rc_array[i]; + break; + } + } + + xfree(msg); + xfree(rc_array); + slurm_free_resource_allocation_response_msg(alloc_info); + + slurm_complete_job(job_id, 0, 0); +fail1: + if (rc) { + slurm_seterrno_ret(rc); + return SLURM_FAILURE; + } else { + return SLURM_SUCCESS; + } +} + +/* + * slurm_terminate_job_step - terminates a job step by sending a + * REQUEST_TERMINATE_TASKS rpc to all slurmd of a job step, and then + * it calls slurm_complete_job_step() after verifying that all + * nodes in the job step no longer have running tasks from the job + * step. (May take over 35 seconds to return.) + * IN job_id - the job's id + * IN step_id - the job step's id - use SLURM_BATCH_SCRIPT as the step_id + * to terminate a job's batch script + * RET 0 on success or slurm error code + */ +extern int +slurm_terminate_job_step (uint32_t job_id, uint32_t step_id) +{ + resource_allocation_response_msg_t *alloc_info; + job_step_info_response_msg_t *step_info; + int rc; + int i; + + if (slurm_allocation_lookup(job_id, &alloc_info)) { + rc = slurm_get_errno(); + goto fail1; + } + + /* + * The controller won't give us info about the batch script job step, + * so we need to handle that seperately. + */ + if (step_id == SLURM_BATCH_SCRIPT) { + rc = _terminate_batch_script_step(alloc_info); + goto done; + } + + /* + * Otherwise, look through the list of job step info and find + * the one matching step_id. Terminate that step. + */ + rc = slurm_get_job_steps((time_t)0, job_id, step_id, &step_info, SHOW_ALL); + if (rc != 0) + goto fail2; + for (i = 0; i < step_info->job_step_count; i++) { + printf("slurm_terminate_job_step job_id=%u, stepid=%u\n", + step_info->job_steps[i].job_id, + step_info->job_steps[i].step_id); + if (step_info->job_steps[i].job_id == job_id + && step_info->job_steps[i].step_id == step_id) { + rc = _terminate_job_step(&step_info->job_steps[i], + alloc_info); + break; + } + } + slurm_free_job_step_info_response_msg(step_info); +fail2: +done: + slurm_free_resource_allocation_response_msg(alloc_info); +fail1: + if (rc) { + slurm_seterrno_ret(rc); + return SLURM_FAILURE; + } else { + return SLURM_SUCCESS; + } +} + + +/* + * Poll the slurmds at the addresses listed in "addresses" for the + * existence of the specified job step. + * + * Return 0 if the job step is completely terminated. Otherwise, -1 + * shall be returned. + */ +static int +_job_step_wait(uint32_t jobid, uint32_t stepid, + const slurm_addr addresses[], int num_nodes, + int wait_time) +{ + slurm_msg_t *msg; /* array of message structs, one per node */ + kill_tasks_msg_t rpc; + int *rc_array; + int rc = -1; + int i; + time_t start_time; + + /* same remote procedure call for each node */ + rpc.job_id = jobid; + rpc.job_step_id = stepid; + rpc.signal = 0; + + msg = xmalloc(sizeof(slurm_msg_t) * num_nodes); + rc_array = xmalloc(sizeof(int) * num_nodes); + for (i = 0; i < num_nodes; i++) { + msg[i].msg_type = REQUEST_SIGNAL_TASKS; + msg[i].data = &rpc; + msg[i].address = addresses[i]; + } + + start_time = time(NULL); + while(time(NULL) < start_time+wait_time && rc != 0) { + _p_send_recv_rc_msg(num_nodes, msg, rc_array, 10); + + rc = 0; + for (i = 0; i < num_nodes; i++) { + if (rc_array[i] != ESLURM_INVALID_JOB_ID) { + rc = -1; + break; + } + } + if (rc == 0) + break; + else + sleep(2); + } + + xfree(rc_array); + xfree(msg); + + return rc; +} + +/* + * Send a REQUEST_TERMINATE_TASKS rpc to all nodes in a job step. Then + * poll the slurmds for up to 35 seconds (with REQUEST_SIGNAL_TASKS) + * waiting for the job step to completely terminate. Finally, if all + * slurmds report ESLURM_INVALID_JOB_ID then send REQUEST_COMPLETE_JOB_STEP + * to the slurmctld. + * + * RET Upon successful termination of the job step, 0 shall be returned. + * Otherwise, -1 shall be returned and errno set to indicate the error. + */ +static int +_terminate_job_step(const job_step_info_t *step, + const resource_allocation_response_msg_t *allocation) +{ + slurm_msg_t *msg; /* array of message structs, one per node */ + kill_tasks_msg_t rpc; + slurm_addr *address; + int num_nodes; + int *rc_array; + int rc = SLURM_SUCCESS; + int i; + + _get_step_addresses(step, allocation, + &address, &num_nodes); + + /* + * Send REQUEST_TERMINATE_TASKS to all nodes of the step + */ + rpc.job_id = step->job_id; + rpc.job_step_id = step->step_id; + rpc.signal = (uint32_t)-1; /* not used by slurmd */ + + msg = xmalloc(sizeof(slurm_msg_t) * num_nodes); + rc_array = xmalloc(sizeof(int) * num_nodes); + for (i = 0; i < num_nodes; i++) { + msg[i].msg_type = REQUEST_TERMINATE_TASKS; + msg[i].data = &rpc; + msg[i].address = address[i]; + } + + _p_send_recv_rc_msg(num_nodes, msg, rc_array, 10); + + xfree(msg); + xfree(rc_array); + + /* + * Wait until all nodes report that the step is gone + */ + rc = _job_step_wait(step->job_id, step->step_id, + address, num_nodes, 35); + + xfree(address); + + /* + * If the job step is really gone, then signal the controller + * with the job step completion message. + */ + if (rc == 0) { + rc = slurm_complete_job_step(step->job_id, step->step_id, 0, 0); + } + + return rc; +} + +static int _terminate_batch_script_step( + const resource_allocation_response_msg_t *allocation) +{ + slurm_msg_t msg; + kill_tasks_msg_t rpc; + int num_nodes; + int rc = SLURM_SUCCESS; + int i; + + rpc.job_id = allocation->job_id; + rpc.job_step_id = SLURM_BATCH_SCRIPT; + rpc.signal = (uint32_t)-1; /* not used by slurmd */ + + msg.msg_type = REQUEST_TERMINATE_TASKS; + msg.data = &rpc; + msg.address = allocation->node_addr[0]; + + rc = slurm_send_recv_rc_msg(&msg, 10); + + return rc; +} + + diff --git a/src/common/bitstring.c b/src/common/bitstring.c index 8ac43f88babfa496dccb5bba43ac5683465d61a9..7583156a527cf773a17131df0a04a93349420638 100644 --- a/src/common/bitstring.c +++ b/src/common/bitstring.c @@ -65,6 +65,7 @@ strong_alias(bit_copy, slurm_bit_copy); strong_alias(bit_pick_cnt, slurm_bit_pick_cnt); strong_alias(bitfmt2int, slurm_bitfmt2int); strong_alias(bit_nffc, slurm_bit_nffc); +strong_alias(bit_noc, slurm_bit_noc); strong_alias(bit_nffs, slurm_bit_nffs); strong_alias(bit_copybits, slurm_bit_copybits); strong_alias(bit_unfmt, slurm_bit_unfmt); @@ -266,9 +267,9 @@ bit_nffc(bitstr_t *b, int n) int cnt = 0; _assert_bitstr_valid(b); - assert(n > 0 && n <= _bitstr_bits(b)); + assert(n > 0 && n < _bitstr_bits(b)); - for (bit = 0; bit <= _bitstr_bits(b) - n; bit++) { + for (bit = 0; bit <= _bitstr_bits(b); bit++) { if (bit_test(b, bit)) { /* fail */ cnt = 0; } else { @@ -283,6 +284,55 @@ bit_nffc(bitstr_t *b, int n) return value; } +/* Find n contiguous bits clear in b starting at some offset. + * b (IN) bitstring to search + * n (IN) number of bits needed + * seed (IN) position at which to begin search + * RETURN position of first bit in range (-1 if none found) + */ +bitoff_t +bit_noc(bitstr_t *b, int n, int seed) +{ + bitoff_t value = -1; + bitoff_t bit; + int cnt = 0; + + _assert_bitstr_valid(b); + assert(n > 0 && n <= _bitstr_bits(b)); + + if ((seed + n) >= _bitstr_bits(b)) + seed = _bitstr_bits(b); /* skip offset test, too small */ + + for (bit = seed; bit < _bitstr_bits(b); bit++) { /* start at offset */ + if (bit_test(b, bit)) { /* fail */ + cnt = 0; + } else { + cnt++; + if (cnt >= n) { + value = bit - (cnt - 1); + return value; + } + } + } + + cnt = 0; /* start at beginning */ + for (bit = 0; bit < _bitstr_bits(b); bit++) { + if (bit_test(b, bit)) { /* fail */ + if (bit >= seed) + break; + cnt = 0; + } else { + cnt++; + if (cnt >= n) { + value = bit - (cnt - 1); + return value; + } + } + } + + return -1; +} + /* Find the first n contiguous bits set in b. * b (IN) bitstring to search * n (IN) number of bits needed diff --git a/src/common/bitstring.h b/src/common/bitstring.h index e73661c3662d21e869a04fb9fcf1cffa644c0388..ca23991f3ff8ad0e3c70f0aed78d6094eeb6eab1 100644 --- a/src/common/bitstring.h +++ b/src/common/bitstring.h @@ -139,6 +139,7 @@ bitoff_t bit_ffs(bitstr_t *b); /* new */ bitoff_t bit_nffs(bitstr_t *b, int n); bitoff_t bit_nffc(bitstr_t *b, int n); +bitoff_t bit_noc(bitstr_t *b, int n, int seed); void bit_free(bitstr_t *b); bitstr_t *bit_realloc(bitstr_t *b, bitoff_t nbits); bitoff_t bit_size(bitstr_t *b); diff --git a/src/common/slurm_xlator.h b/src/common/slurm_xlator.h index 6cc9d5a210e092250e305fdf533af3b2e7e2aff4..442b53489830b001848d893793843e804d105dbc 100644 --- a/src/common/slurm_xlator.h +++ b/src/common/slurm_xlator.h @@ -89,6 +89,7 @@ #define bit_pick_cnt slurm_bit_pick_cnt #define bitfmt2int slurm_bitfmt2int #define bit_nffc slurm_bit_nffc +#define bit_noc slurm_bit_noc #define bit_nffs slurm_bit_nffs #define bit_unfmt slurm_bit_unfmt #define bit_copybits slurm_bit_copybits diff --git a/src/plugins/switch/elan/qsw.c b/src/plugins/switch/elan/qsw.c index 882c25bb95c9bd2e8c9dfe4855912d47e4739b1c..119be580dffa023f5bce9f54f952aeead8c0e7f1 100644 --- a/src/plugins/switch/elan/qsw.c +++ b/src/plugins/switch/elan/qsw.c @@ -639,6 +639,7 @@ static int _alloc_hwcontext(bitstr_t *nodeset, uint32_t prognum, int num) { int new = -1; + static int seed = 0; assert(nodeset); if (qsw_internal_state) { @@ -661,8 +662,9 @@ _alloc_hwcontext(bitstr_t *nodeset, uint32_t prognum, int num) step_ctx_p->st_high); } list_iterator_destroy(iter); - bit = bit_nffc(busy_context, num); + bit = bit_noc(busy_context, num, seed); if (bit != -1) { + seed = bit + num; step_ctx_p = xmalloc(sizeof(struct step_ctx)); step_ctx_p->st_prognum = prognum; step_ctx_p->st_low = bit; diff --git a/src/scancel/scancel.c b/src/scancel/scancel.c index 2cad564dbb775c4555c4ace6a31e16b3dc0f1207..fad652f9a933739785afce0a826988490f9c27fa 100644 --- a/src/scancel/scancel.c +++ b/src/scancel/scancel.c @@ -221,7 +221,7 @@ _cancel_job_id (uint32_t job_id, uint16_t signal) for (i=0; i<MAX_CANCEL_RETRY; i++) { if (signal == (uint16_t)-1) { - verbose("Signal %u to job %u", SIGKILL, job_id); + verbose("Terminating job %u", SIGKILL, job_id); error_code = slurm_kill_job (job_id, SIGKILL, (uint16_t)opt.batch); } else { @@ -233,8 +233,9 @@ _cancel_job_id (uint32_t job_id, uint16_t signal) else error_code = slurm_signal_job (job_id, signal); } - if ((error_code == 0) || - (errno != ESLURM_TRANSITION_STATE_NO_UPDATE)) + if (error_code == 0 + || (errno != ESLURM_TRANSITION_STATE_NO_UPDATE + && errno != ESLURM_JOB_PENDING)) break; verbose("Job is in transistional state, retrying"); sleep ( 5 + i ); @@ -256,18 +257,18 @@ _cancel_step_id (uint32_t job_id, uint32_t step_id, uint16_t signal) for (i=0; i<MAX_CANCEL_RETRY; i++) { if (signal == (uint16_t)-1) { - verbose("Signal %u to step %u.%u", - SIGKILL, job_id, step_id); - error_code = slurm_kill_job_step (job_id, step_id, - SIGKILL); + verbose("Terminating step %u.%u", + job_id, step_id); + error_code = slurm_terminate_job_step(job_id, step_id); } else { verbose("Signal %u to step %u.%u", signal, job_id, step_id); error_code = slurm_signal_job_step(job_id, step_id, signal); } - if ((error_code == 0) || - (errno != ESLURM_TRANSITION_STATE_NO_UPDATE)) + if (error_code == 0 + || (errno != ESLURM_TRANSITION_STATE_NO_UPDATE + && errno != ESLURM_JOB_PENDING)) break; verbose("Job is in transistional state, retrying"); sleep ( 5 + i ); diff --git a/testsuite/expect/globals.example b/testsuite/expect/globals.example index abe85efd92aeb9a51f4da779de36fd0c65327076..c0e4c69bd214aa5a00664f00c082e5c9cc8aa606 100755 --- a/testsuite/expect/globals.example +++ b/testsuite/expect/globals.example @@ -5,7 +5,7 @@ # Copyright (C) 2002 The Regents of the University of California. # Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). # Written by Morris Jette <jette1@llnl.gov> -# UCRL-CODE-2002-040. +# UCRL-CODE-217948. # # This file is part of SLURM, a resource management program. # For details, see <http://www.llnl.gov/linux/slurm/>. diff --git a/testsuite/expect/test10.10 b/testsuite/expect/test10.10 index 070622cb8e2a224d7f93caaeeb9f6ac7b790a62f..dc48c8631bb12c6ebc4c24887cd2bcc5fc744280 100755 --- a/testsuite/expect/test10.10 +++ b/testsuite/expect/test10.10 @@ -8,7 +8,7 @@ # "FAILURE: ..." otherwise with an explanation of the failure, OR # anything else indicates a failure mode that must be investigated. ############################################################################ -# Copyright (C) 2002 The Regents of the University of California. +# Copyright (C) 2002-2006 The Regents of the University of California. # Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). # Written by Danny Auble <da@llnl.gov> # UCRL-CODE-217948. @@ -42,61 +42,63 @@ print_header $test_id # Check the the --noheader option in smap # in curses format. # - + +set timeout 10 spawn $smap --noheader expect { -re "ID J" { - set stuff [concat $stuff "1"] + set stuff [concat $stuff "1"] incr matches exp_continue } -re "OBID" { - set stuff [concat $stuff "2"] + set stuff [concat $stuff "2"] incr matches exp_continue } -re "PARTITION" { - set stuff [concat $stuff "3"] + set stuff [concat $stuff "3"] incr matches exp_continue } -re "BGL_BLOCK" { - set stuff [concat $stuff "4"] + set stuff [concat $stuff "4"] incr matches exp_continue } -re "USER" { - set stuff [concat $stuff "5"] + set stuff [concat $stuff "5"] incr matches exp_continue } -re "NAME" { - set stuff [concat $stuff "6"] + set stuff [concat $stuff "6"] incr matches exp_continue } -re "ST " { - set stuff [concat $stuff "7"] + set stuff [concat $stuff "7"] incr matches exp_continue } -re "TIME" { - set stuff [concat $stuff "8"] + set stuff [concat $stuff "8"] incr matches exp_continue } -re "NODES" { - set stuff [concat $stuff "9"] + set stuff [concat $stuff "9"] incr matches exp_continue } -re "NODELIST" { - set stuff [concat $stuff "10"] + set stuff [concat $stuff "10"] incr matches exp_continue } - -re "\n" { - send "q" + -re "200\[0-9]" { + exec sleep 1 + send "\n" exp_continue } timeout { @@ -109,8 +111,7 @@ expect { } if {$matches != 0} { - send_user $stuff - send_user "\nFAILURE: smap --noheader produces header\n" + send_user "\nFAILURE: smap --noheader produces header ($stuff)\n" set exit_code 1 } diff --git a/testsuite/expect/test10.7 b/testsuite/expect/test10.7 index ce2bc0d57523caea22d2e899aa8b0f21867f35bd..804a6e80060f2533ef02edd0afacd834a75d289b 100755 --- a/testsuite/expect/test10.7 +++ b/testsuite/expect/test10.7 @@ -8,7 +8,7 @@ # "FAILURE: ..." otherwise with an explanation of the failure, OR # anything else indicates a failure mode that must be investigated. ############################################################################ -# Copyright (C) 2002 The Regents of the University of California. +# Copyright (C) 2002-2006 The Regents of the University of California. # Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). # Written by Danny Auble <da@llnl.gov> # UCRL-CODE-217948. diff --git a/testsuite/expect/test10.9 b/testsuite/expect/test10.9 index 2d2eb5cea04a2e09367db3d2137348c9db58efab..09cee9b13102a21957e3fb9269d933edd4879e8e 100755 --- a/testsuite/expect/test10.9 +++ b/testsuite/expect/test10.9 @@ -8,7 +8,7 @@ # "FAILURE: ..." otherwise with an explanation of the failure, OR # anything else indicates a failure mode that must be investigated. ############################################################################ -# Copyright (C) 2002 The Regents of the University of California. +# Copyright (C) 2002-2006 The Regents of the University of California. # Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). # Written by Danny Auble <da@llnl.gov> # UCRL-CODE-217948. @@ -44,19 +44,24 @@ print_header $test_id # Check the smap --iterate option # in curses format. # - +set timeout 10 spawn $smap -i 1 expect { -re "Screen is too small" { set too_small 1 exp_continue } - -re "JOBID" { - set stuff [concat $stuff "2"] + -re "NODELIST" { incr matches - if { $matches > 2 } { - send "q" - } + if { $matches == 1 } { + send "s" + } + if { $matches == 2 } { + send "j" + } + if { $matches >= 3 } { + send "\n" + } exp_continue } @@ -73,7 +78,7 @@ if {$too_small != 0} { exit 0 } if {$matches <= 2} { - send_user "\nFAILURE: smap --iterate failed\n" + send_user "\nFAILURE: smap --iterate failed ($matches)\n" set exit_code 1 } diff --git a/testsuite/expect/test6.3 b/testsuite/expect/test6.3 index 2ce113c6beb6dfbe8eb93bde310e9ffc36ad2427..a2b0ce758460d120a6d673a889471711edd02382 100755 --- a/testsuite/expect/test6.3 +++ b/testsuite/expect/test6.3 @@ -45,7 +45,7 @@ print_header $test_id # exec $bin_rm -f $file_in exec echo "#!$bin_bash" >$file_in -exec echo "$srun $bin_sleep 10" >>$file_in +exec echo "$srun $bin_sleep 600" >>$file_in exec $bin_chmod 700 $file_in # @@ -71,7 +71,7 @@ if {$job_id1 == 0} { send_user "\nFAILURE: job submit failure\n" exit 1 } -exec $bin_rm -f $file_in +#exec $bin_rm -f $file_in # # Test interactive scancel diff --git a/testsuite/expect/test6.5 b/testsuite/expect/test6.5 index c0a969f91312e1101df54c9749a750309c2c3f97..00865fcc499f93a02dc971bc9d989bb0ff1c4651 100755 --- a/testsuite/expect/test6.5 +++ b/testsuite/expect/test6.5 @@ -52,6 +52,7 @@ exec $bin_chmod 700 $file_in # # Submit a couple of jobs so we have something to work with # +set timeout 10 spawn $srun --batch --output=/dev/null --error=/dev/null -t1 $file_in expect { -re "jobid ($number) submitted" { @@ -101,7 +102,7 @@ exec $bin_rm -f $file_in spawn $scancel --verbose $job_id1 expect { - -re "Signal 9 to job" { + -re "Terminating job" { incr matches exp_continue } @@ -120,7 +121,7 @@ if {$matches != 1} { set matches 0 spawn $scancel $job_id2 expect { - -re "Signal 9 to job" { + -re "Terminating job" { incr matches exp_continue } diff --git a/testsuite/expect/test6.7 b/testsuite/expect/test6.7 index 166a38bcbb74e4d78f6d3766d55aa16e01af370b..8ed2a1fbd3053aed19ea8a66473f7afee6c84b17 100755 --- a/testsuite/expect/test6.7 +++ b/testsuite/expect/test6.7 @@ -104,7 +104,7 @@ if {$matches != 1} { set matches 0 spawn $scancel --verbose $job_id1 expect { - -re "Signal 9 to job" { + -re "Terminating job" { incr matches exp_continue }