diff --git a/doc/man/man1/scancel.1 b/doc/man/man1/scancel.1
index 277c4c7d6b22fca4d91d62894f460d08c05ffcb6..058d17013481b8257a54cd68749eaeb08055ec4d 100644
--- a/doc/man/man1/scancel.1
+++ b/doc/man/man1/scancel.1
@@ -1,4 +1,4 @@
-.TH SCANCEL "1" "April 2009" "scancel 2.0" "Slurm components"
+.TH SCANCEL "1" "January 2011" "scancel 2.3" "Slurm components"
 
 .SH "NAME"
 scancel \- Used to signal jobs or job steps that are under the control of Slurm.
@@ -33,6 +33,8 @@ for details.
 \fB-\-ctld\fR
 Send the job signal request to the slurmctld daemon rather than directly to
 the slurmd daemons. This increases overhead, but offers better fault tolerance.
+This is the default behavior on architectures using front end nodes (e.g.
+BlueGene and Cray computers) or when the \fB\-\-clusters\fR option is used.
 
 .TP
 \fB\-\-help\fR
@@ -215,7 +217,7 @@ scancel \-\-state=PENDING \-\-user=bob \-\-partition=debug
 
 .SH "COPYING"
 Copyright (C) 2002-2007 The Regents of the University of California.
-Copyright (C) 2008-2009 Lawrence Livermore National Security.
+Copyright (C) 2008-2011 Lawrence Livermore National Security.
 Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
 CODE\-OCEC\-09\-009. All rights reserved.
 .LP
diff --git a/src/common/read_config.c b/src/common/read_config.c
index 927b402c02f6ba9d76810fa6efe5d9d6fb6fc40b..67e032eaa9530541d779e206f5e963773722a72a 100644
--- a/src/common/read_config.c
+++ b/src/common/read_config.c
@@ -472,6 +472,7 @@ static int _parse_frontend(void **dest, slurm_parser_enum_t type,
 
 	*dest = (void *)n;
 
+	s_p_hashtbl_destroy(tbl);
 	return 1;
 }
 
diff --git a/src/scontrol/update_node.c b/src/scontrol/update_node.c
index f453e76a613e36284f76497eac17c5efe42901fa..ddd55497d62e88e1b2faf6b01239fd46efb4f705 100644
--- a/src/scontrol/update_node.c
+++ b/src/scontrol/update_node.c
@@ -309,6 +309,13 @@ scontrol_update_front_end (int argc, char *argv[])
 		}
 	}
 
+	if ((front_end_msg.node_state == NODE_STATE_DOWN) &&
+	    ((front_end_msg.reason == NULL) ||
+	     (strlen(front_end_msg.reason) == 0))) {
+		fprintf (stderr, "You must specify a reason when DOWNING a "
+			 "frontend node\nRequest aborted\n");
+		goto done;
+	}
 	if ((front_end_msg.node_state == NODE_STATE_DRAIN) &&
 	    ((front_end_msg.reason == NULL) ||
 	     (strlen(front_end_msg.reason) == 0))) {
diff --git a/src/slurmctld/job_mgr.c b/src/slurmctld/job_mgr.c
index b8dddb16e0973385b974514c7ee7f2fe26d7c1c3..3553f070c73089b9dde194d8ad126606962e6602 100644
--- a/src/slurmctld/job_mgr.c
+++ b/src/slurmctld/job_mgr.c
@@ -1973,6 +1973,7 @@ extern int kill_job_by_part_name(char *part_name)
  * resource for its jobs and kill them.
 * IN node_name - name of a front end node
 * RET number of jobs associated with this front end node
+ * NOTE: Patterned after kill_running_job_by_node_name()
  */
 extern int kill_job_by_front_end_name(char *node_name)
 {
@@ -7736,7 +7737,8 @@ abort_job_on_node(uint32_t job_id, struct job_record *job_ptr,
 #ifdef HAVE_FRONT_END
 	xassert(job_ptr->batch_host);
 	agent_info->hostlist = hostlist_create(job_ptr->batch_host);
-	debug("Aborting job %u on node %s", job_id, job_ptr->batch_host);
+	debug("Aborting job %u on front end node %s", job_id,
+	      job_ptr->batch_host);
 #else
 	agent_info->hostlist = hostlist_create(node_ptr->name);
 	debug("Aborting job %u on node %s", job_id, node_ptr->name);
@@ -7781,7 +7783,8 @@ kill_job_on_node(uint32_t job_id, struct job_record *job_ptr,
 #ifdef HAVE_FRONT_END
 	xassert(job_ptr->batch_host);
 	agent_info->hostlist = hostlist_create(job_ptr->batch_host);
-	debug("Killing job %u on node %s", job_id, job_ptr->batch_host);
+	debug("Killing job %u on front end node %s", job_id,
+	      job_ptr->batch_host);
 #else
 	agent_info->hostlist = hostlist_create(node_ptr->name);
 	debug("Killing job %u on node %s", job_id, node_ptr->name);
@@ -7966,7 +7969,6 @@ _xmit_new_end_time(struct job_record *job_ptr)
 	hostlist_push(agent_args->hostlist, job_ptr->batch_host);
 	agent_args->node_count = 1;
 #else
-	agent_args->hostlist = hostlist_create("");
 	for (i = 0; i < node_record_count; i++) {
 		if (bit_test(job_ptr->node_bitmap, i) == 0)
 			continue;
diff --git a/src/slurmctld/proc_req.c b/src/slurmctld/proc_req.c
index c3816de90046f21f7ed66ea5b5d69700fb1fc123..76702a95b0ab6ccba8e195c8694a96fdf12e20e0 100644
--- a/src/slurmctld/proc_req.c
+++ b/src/slurmctld/proc_req.c
@@ -1118,15 +1118,15 @@ static void _slurm_rpc_dump_nodes(slurm_msg_t * msg)
 	/* Locks: Read config, write node (reset allocated CPU count in some
 	 * select plugins) */
 	slurmctld_lock_t node_write_lock = {
-		READ_LOCK, NO_LOCK, NO_LOCK, WRITE_LOCK };
+		READ_LOCK, NO_LOCK, WRITE_LOCK, NO_LOCK };
 	uid_t uid = g_slurm_auth_get_uid(msg->auth_cred, NULL);
 
 	START_TIMER;
 	debug2("Processing RPC: REQUEST_NODE_INFO from uid=%d", uid);
 	lock_slurmctld(node_write_lock);
 
-	if ((slurmctld_conf.private_data & PRIVATE_DATA_NODES)
-	    && (!validate_operator(uid))) {
+	if ((slurmctld_conf.private_data & PRIVATE_DATA_NODES) &&
+	    (!validate_operator(uid))) {
 		unlock_slurmctld(node_write_lock);
 		error("Security violation, REQUEST_NODE_INFO RPC from uid=%d",
 		      uid);
@@ -1435,8 +1435,8 @@ static void _slurm_rpc_complete_batch_script(slurm_msg_t * msg)
 	lock_slurmctld(job_write_lock);
 
 	/* Send batch step info to accounting */
-	if (association_based_accounting
-	    && (job_ptr = find_job_record(comp_msg->job_id))) {
+	if (association_based_accounting &&
+	    (job_ptr = find_job_record(comp_msg->job_id))) {
 		struct step_record batch_step;
 		memset(&batch_step, 0, sizeof(struct step_record));
 		batch_step.job_ptr = job_ptr;
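
The lock fix in _slurm_rpc_dump_nodes() is the easiest hunk above to misread: slurmctld_lock_t is initialized positionally over the config, job, node, and partition data locks, so the old initializer took a partition write lock while leaving the node records the handler actually modifies unprotected. Below is a minimal sketch of that positional layout; the field names and enum ordering are assumptions following the slurmctld locking convention, not a verbatim copy of src/slurmctld/locks.h.

/* Sketch only: the positional lock initializer corrected by the
 * proc_req.c hunk.  Field names and enum ordering are assumptions
 * based on the slurmctld locking convention. */
typedef enum { NO_LOCK, READ_LOCK, WRITE_LOCK } lock_level_t;

typedef struct {
	lock_level_t config;	/* slurmctld configuration */
	lock_level_t job;	/* job records */
	lock_level_t node;	/* node records */
	lock_level_t partition;	/* partition records */
} slurmctld_lock_t;

/* Old: { READ_LOCK, NO_LOCK, NO_LOCK, WRITE_LOCK }
 *   -> read config, write partition data; node records were touched
 *      without a write lock while a partition lock was held needlessly.
 * New: { READ_LOCK, NO_LOCK, WRITE_LOCK, NO_LOCK }
 *   -> read config, write node data, matching the "Read config, write
 *      node" comment above the declaration. */
static const slurmctld_lock_t node_write_lock =
	{ READ_LOCK, NO_LOCK, WRITE_LOCK, NO_LOCK };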