From 917f8bb298c409e380fb90a621e3059d35a08827 Mon Sep 17 00:00:00 2001
From: Moe Jette <jette1@llnl.gov>
Date: Tue, 3 Jan 2006 17:01:03 +0000
Subject: [PATCH] Send REQUEST_SUSPEND RPC to slurmd rather than signal RPCs.

---
 src/slurmctld/agent.c   |  5 ++-
 src/slurmctld/job_mgr.c | 55 ++++++++++++++++++++---
 src/slurmd/slurmd/req.c | 96 ++++++++++++++++++++++++++++++++++++++++-
 3 files changed, 148 insertions(+), 8 deletions(-)

diff --git a/src/slurmctld/agent.c b/src/slurmctld/agent.c
index 8fc618d77f7..12ca88d199d 100644
--- a/src/slurmctld/agent.c
+++ b/src/slurmctld/agent.c
@@ -4,9 +4,9 @@
  *
  * $Id$
  *****************************************************************************
- * Copyright (C) 2002 The Regents of the University of California.
+ * Copyright (C) 2002-5 The Regents of the University of California.
  * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
- * Written by Morris Jette <jette@llnl.gov>, et. al.
+ * Written by Morris Jette <jette1@llnl.gov>, et. al.
  * Derived from pdsh written by Jim Garlick <garlick1@llnl.gov>
  * UCRL-CODE-2002-040.
  *
@@ -313,6 +313,7 @@ static int _valid_agent_arg(agent_arg_t *agent_arg_ptr)
                 (agent_arg_ptr->msg_type == REQUEST_PING) ||
                 (agent_arg_ptr->msg_type == REQUEST_BATCH_JOB_LAUNCH) ||
                 (agent_arg_ptr->msg_type == REQUEST_SHUTDOWN) ||
+                (agent_arg_ptr->msg_type == REQUEST_SUSPEND) ||
                 (agent_arg_ptr->msg_type == REQUEST_RECONFIGURE) ||
                 (agent_arg_ptr->msg_type == RESPONSE_RESOURCE_ALLOCATION) ||
                 (agent_arg_ptr->msg_type == REQUEST_NODE_REGISTRATION_STATUS));
diff --git a/src/slurmctld/job_mgr.c b/src/slurmctld/job_mgr.c
index 0b1f9be6ab7..be9b1e9d305 100644
--- a/src/slurmctld/job_mgr.c
+++ b/src/slurmctld/job_mgr.c
@@ -5,7 +5,7 @@
  *
  * $Id$
  *****************************************************************************
- * Copyright (C) 2002 The Regents of the University of California.
+ * Copyright (C) 2002-5 The Regents of the University of California.
  * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
  * Written by Morris Jette <jette1@llnl.gov>
  * UCRL-CODE-2002-040.
@@ -129,6 +129,7 @@ static void _set_job_id(struct job_record *job_ptr);
 static void _set_job_prio(struct job_record *job_ptr);
 static void _signal_batch_job(struct job_record *job_ptr, uint16_t signal);
 static void _signal_job(struct job_record *job_ptr, int signal);
+static void _suspend_job(struct job_record *job_ptr, uint16_t op);
 static int _suspend_job_nodes(struct job_record *job_ptr);
 static bool _top_priority(struct job_record *job_ptr);
 static int _validate_job_create_req(job_desc_msg_t * job_desc);
@@ -3934,6 +3935,52 @@ static void _signal_job(struct job_record *job_ptr, int signal)
         return;
 }
 
+/* Send suspend request to the slurmd of all nodes associated with a job */
+static void _suspend_job(struct job_record *job_ptr, uint16_t op)
+{
+        agent_arg_t *agent_args;
+        suspend_msg_t *sus_ptr;
+        int i, buf_rec_size = 0;
+
+        agent_args = xmalloc(sizeof(agent_arg_t));
+        agent_args->msg_type = REQUEST_SUSPEND;
+        agent_args->retry = 1;
+        sus_ptr = xmalloc(sizeof(suspend_msg_t));
+        sus_ptr->job_id = job_ptr->job_id;
+        sus_ptr->op = op;
+
+        for (i = 0; i < node_record_count; i++) {
+                if (bit_test(job_ptr->node_bitmap, i) == 0)
+                        continue;
+                if ((agent_args->node_count + 1) > buf_rec_size) {
+                        buf_rec_size += 128;
+                        xrealloc((agent_args->slurm_addr),
+                                 (sizeof(struct sockaddr_in) *
+                                  buf_rec_size));
+                        xrealloc((agent_args->node_names),
+                                 (MAX_NAME_LEN * buf_rec_size));
+                }
+                agent_args->slurm_addr[agent_args->node_count] =
+                        node_record_table_ptr[i].slurm_addr;
+                strncpy(&agent_args->
+                        node_names[MAX_NAME_LEN * agent_args->node_count],
+                        node_record_table_ptr[i].name, MAX_NAME_LEN);
+                agent_args->node_count++;
+#ifdef HAVE_FRONT_END   /* Operate only on front-end */
+                break;
+#endif
+        }
+
+        if (agent_args->node_count == 0) {
+                xfree(sus_ptr);
+                xfree(agent_args);
+                return;
+        }
+
+        agent_args->msg_args = sus_ptr;
+        agent_queue_request(agent_args);
+        return;
+}
 /* Specified job is being suspended, release allocated nodes */
 static int _suspend_job_nodes(struct job_record *job_ptr)
 {
@@ -4085,8 +4132,7 @@ extern int job_suspend(suspend_msg_t *sus_ptr, uid_t uid,
                 rc = _suspend_job_nodes(job_ptr);
                 if (rc != SLURM_SUCCESS)
                         goto reply;
-                _signal_batch_job(job_ptr, SIGSTOP);
-                _signal_job(job_ptr, SIGSTOP);
+                _suspend_job(job_ptr, sus_ptr->op);
                 job_ptr->job_state = JOB_SUSPENDED;
                 if (job_ptr->suspend_time) {
                         job_ptr->pre_sus_time +=
@@ -4105,8 +4151,7 @@ extern int job_suspend(suspend_msg_t *sus_ptr, uid_t uid,
                 rc = _resume_job_nodes(job_ptr);
                 if (rc != SLURM_SUCCESS)
                         goto reply;
-                _signal_job(job_ptr, SIGCONT);
-                _signal_batch_job(job_ptr, SIGCONT);
+                _suspend_job(job_ptr, sus_ptr->op);
                 job_ptr->job_state = JOB_RUNNING;
                 if (job_ptr->time_limit != INFINITE) {
                         /* adjust effective time_limit */
diff --git a/src/slurmd/slurmd/req.c b/src/slurmd/slurmd/req.c
index 420c0ca58a9..e6262afa6ae 100644
--- a/src/slurmd/slurmd/req.c
+++ b/src/slurmd/slurmd/req.c
@@ -2,7 +2,7 @@
  * src/slurmd/slurmd/req.c - slurmd request handling
  * $Id$
  *****************************************************************************
- * Copyright (C) 2002 The Regents of the University of California.
+ * Copyright (C) 2002-5 The Regents of the University of California.
  * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
  * Written by Mark Grondona <mgrondona@llnl.gov>.
  * UCRL-CODE-2002-040.
@@ -79,6 +79,7 @@ static void _rpc_terminate_tasks(slurm_msg_t *, slurm_addr *);
 static void _rpc_timelimit(slurm_msg_t *, slurm_addr *);
 static void _rpc_reattach_tasks(slurm_msg_t *, slurm_addr *);
 static void _rpc_signal_job(slurm_msg_t *, slurm_addr *);
+static void _rpc_suspend_job(slurm_msg_t *, slurm_addr *);
 static void _rpc_terminate_job(slurm_msg_t *, slurm_addr *);
 static void _rpc_update_time(slurm_msg_t *, slurm_addr *);
 static void _rpc_shutdown(slurm_msg_t *msg, slurm_addr *cli_addr);
@@ -152,6 +153,10 @@ slurmd_req(slurm_msg_t *msg, slurm_addr *cli)
                 _rpc_signal_job(msg, cli);
                 slurm_free_signal_job_msg(msg->data);
                 break;
+        case REQUEST_SUSPEND:
+                _rpc_suspend_job(msg, cli);
+                slurm_free_suspend_msg(msg->data);
+                break;
         case REQUEST_TERMINATE_JOB:
                 debug2("Processing RPC: REQUEST_TERMINATE_JOB");
                 _rpc_terminate_job(msg, cli);
@@ -1425,6 +1430,95 @@ _rpc_signal_job(slurm_msg_t *msg, slurm_addr *cli)
         }
 }
 
+/*
+ * Send a job suspend/resume request through the appropriate slurmstepds for
+ * each job step belonging to a given job allocation.
+ */
+static void
+_rpc_suspend_job(slurm_msg_t *msg, slurm_addr *cli)
+{
+        suspend_msg_t *req = msg->data;
+        uid_t req_uid = g_slurm_auth_get_uid(msg->cred);
+        uid_t job_uid;
+        List steps;
+        ListIterator i;
+        step_loc_t *stepd;
+        int step_cnt = 0;
+        int fd, rc = SLURM_SUCCESS, sig_num;
+        char *sig_name;
+
+        if (req->op == SUSPEND_JOB) {
+                sig_name = "STOP";
+                sig_num = SIGSTOP;
+        } else if (req->op == RESUME_JOB) {
+                sig_name = "CONT";
+                sig_num = SIGCONT;
+        } else {
+                error("REQUEST_SUSPEND: bad op code %u",
+                        req->op);
+                rc = ESLURM_NOT_SUPPORTED;
+                goto fini;
+        }
+        debug("_rpc_suspend_job jobid=%u uid=%d signal=%s",
+                req->job_id, req_uid, sig_name);
+        job_uid = _get_job_uid(req->job_id);
+        /*
+         * check that requesting user ID is the SLURM UID
+         */
+        if (!_slurm_authorized_user(req_uid)) {
+                error("Security violation: suspend_job(%u) from uid %ld",
+                        req->job_id, (long) req_uid);
+                rc = ESLURM_USER_ID_MISSING;
+                goto fini;
+        }
+
+        /*
+         * Loop through all job steps for this job and signal the
+         * step's process group through the slurmstepd.
+         */
+        steps = stepd_available(conf->spooldir, conf->node_name);
+        i = list_iterator_create(steps);
+        while ((stepd = list_next(i))) {
+                if (stepd->jobid != req->job_id) {
+                        /* multiple jobs expected on shared nodes */
+                        debug3("Step from other job: jobid=%u (this jobid=%u)",
+                                stepd->jobid, req->job_id);
+                        continue;
+                }
+                step_cnt++;
+
+                fd = stepd_connect(stepd->directory, stepd->nodename,
+                                stepd->jobid, stepd->stepid);
+                if (fd == -1) {
+                        debug3("Unable to connect to step %u.%u",
+                                stepd->jobid, stepd->stepid);
+                        continue;
+                }
+
+                debug2(" signal %d to job %u.%u",
+                        sig_num, stepd->jobid, stepd->stepid);
+                if (stepd_signal(fd, sig_num) < 0)
+                        debug("signal jobid=%u failed: %m", stepd->jobid);
+                close(fd);
+        }
+        list_iterator_destroy(i);
+        list_destroy(steps);
+        if (step_cnt == 0)
+                debug2("No steps in jobid %u to send signal %d",
+                        req->job_id, sig_num);
+
+        /*
+         * At this point, if connection still open, we send controller
+         * a reply.
+         */
+ fini:  if (msg->conn_fd >= 0) {
+                slurm_send_rc_msg(msg, rc);
+                if (slurm_close_accepted_conn(msg->conn_fd) < 0)
+                        error("_rpc_suspend_job: close(%d): %m", msg->conn_fd);
+                msg->conn_fd = -1;
+        }
+}
+
 static void
 _rpc_terminate_job(slurm_msg_t *msg, slurm_addr *cli)
 {
-- 
GitLab
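
For context on the end-to-end path this patch creates: job_suspend() in slurmctld now calls the new _suspend_job(), which queues a single REQUEST_SUSPEND through the agent to every node allocated to the job; each slurmd's _rpc_suspend_job() maps SUSPEND_JOB/RESUME_JOB onto SIGSTOP/SIGCONT and forwards the signal to every matching job step via stepd_signal(). The short standalone program below is only a sketch of the process-level effect those signals carry, with a forked child standing in for a job step; it is not SLURM code and uses nothing but standard POSIX calls (all SLURM identifiers named here come from the patch above).

/*
 * Standalone sketch (not SLURM code): the SIGSTOP/SIGCONT semantics that
 * REQUEST_SUSPEND is ultimately translated into by _rpc_suspend_job() and
 * stepd_signal().  A forked child stands in for a job step's process.
 */
#include <signal.h>
#include <stdio.h>
#include <sys/types.h>
#include <sys/wait.h>
#include <unistd.h>

int main(void)
{
        int status;
        pid_t pid = fork();

        if (pid < 0) {
                perror("fork");
                return 1;
        }
        if (pid == 0) {                 /* child: idle like a quiescent task */
                for (;;)
                        sleep(60);
        }

        kill(pid, SIGSTOP);             /* "suspend": SUSPEND_JOB -> SIGSTOP */
        waitpid(pid, &status, WUNTRACED);
        if (WIFSTOPPED(status))
                printf("child %ld stopped by signal %d\n",
                       (long) pid, WSTOPSIG(status));

        kill(pid, SIGCONT);             /* "resume": RESUME_JOB -> SIGCONT */
        waitpid(pid, &status, WCONTINUED);
        if (WIFCONTINUED(status))
                printf("child %ld continued\n", (long) pid);

        kill(pid, SIGKILL);             /* clean up the example child */
        waitpid(pid, &status, 0);
        return 0;
}

Because slurmstepd delivers the signal to the step's process group, one REQUEST_SUSPEND per node replaces the per-signal RPC fan-out that _signal_job() and _signal_batch_job() previously performed from the controller.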