Skip to content
Snippets Groups Projects
Commit a7930665 authored by Moe Jette's avatar Moe Jette
Browse files

Add logic for slurmctld to send launch response to srun whenever a

queued allocation request is satisfied. Srun just has a stub to catch
and log the message while using polling to notice the allocation has
been made.
parent f755c2f4
No related branches found
No related tags found
No related merge requests found
......@@ -12,6 +12,7 @@ sbin_PROGRAMS = slurmctld
slurmctld_LDADD = \
$(top_builddir)/src/common/libcommon.la \
$(top_builddir)/src/common/libhostlist.la \
$(top_builddir)/src/api/libslurm.la \
$(top_builddir)/src/common/libdaemonize.la
......
......@@ -58,6 +58,8 @@
#include <string.h>
#include <unistd.h>
#include <slurm/slurm.h>
#include "src/common/list.h"
#include "src/common/log.h"
#include "src/common/macros.h"
......@@ -273,7 +275,8 @@ static int _valid_agent_arg(agent_arg_t *agent_arg_ptr)
(agent_arg_ptr->msg_type == REQUEST_PING) ||
(agent_arg_ptr->msg_type == REQUEST_BATCH_JOB_LAUNCH) ||
(agent_arg_ptr->msg_type == REQUEST_SHUTDOWN) ||
(agent_arg_ptr->msg_type == REQUEST_RECONFIGURE) ||
(agent_arg_ptr->msg_type == REQUEST_RECONFIGURE) ||
(agent_arg_ptr->msg_type == RESPONSE_RESOURCE_ALLOCATION) ||
(agent_arg_ptr->msg_type == REQUEST_NODE_REGISTRATION_STATUS));
if (agent_arg_ptr->node_count == 0)
......@@ -346,6 +349,7 @@ static void *_wdog(void *args)
if ( (agent_ptr->msg_type == SRUN_PING) ||
(agent_ptr->msg_type == SRUN_TIMEOUT) ||
(agent_ptr->msg_type == RESPONSE_RESOURCE_ALLOCATION) ||
(agent_ptr->msg_type == SRUN_NODE_FAIL) )
srun_agent = true;
......@@ -426,6 +430,11 @@ static void _notify_slurmctld_jobs(agent_info_t *agent_ptr)
srun_node_fail_msg_t *msg = *agent_ptr->msg_args_pptr;
job_id = msg->job_id;
step_id = msg->step_id;
} else if (agent_ptr->msg_type == RESPONSE_RESOURCE_ALLOCATION) {
resource_allocation_response_msg_t *msg =
*agent_ptr->msg_args_pptr;
job_id = msg->job_id;
step_id = NO_VAL;
} else {
error("_notify_slurmctld_jobs invalid msg_type %u",
agent_ptr->msg_type);
......@@ -541,6 +550,7 @@ static void *_thread_per_node_rpc(void *args)
(msg_type == REQUEST_KILL_JOB) );
srun_agent = ( (msg_type == SRUN_PING) ||
(msg_type == SRUN_TIMEOUT) ||
(msg_type == RESPONSE_RESOURCE_ALLOCATION) ||
(msg_type == SRUN_NODE_FAIL) );
/* send request message */
......@@ -855,8 +865,12 @@ static void _purge_agent_args(agent_arg_t *agent_arg_ptr)
if (agent_arg_ptr->msg_args) {
if (agent_arg_ptr->msg_type == REQUEST_BATCH_JOB_LAUNCH)
_slurmctld_free_job_launch_msg(agent_arg_ptr->msg_args);
else if (agent_arg_ptr->msg_type ==
RESPONSE_RESOURCE_ALLOCATION)
slurm_free_resource_allocation_response_msg(
agent_arg_ptr->msg_args);
else
xfree(agent_arg_ptr->msg_args);
}
}
xfree(agent_arg_ptr);
}
......@@ -42,6 +42,7 @@
#include "src/slurmctld/agent.h"
#include "src/slurmctld/locks.h"
#include "src/slurmctld/slurmctld.h"
#include "src/slurmctld/srun_comm.h"
#define MAX_RETRIES 10
......@@ -184,6 +185,7 @@ int schedule(void)
last_job_update = time(NULL);
info("schedule: JobId=%u NodeList=%s",
job_ptr->job_id, job_ptr->nodes);
srun_allocate(job_ptr->job_id);
if (job_ptr->batch_flag)
_launch_job(job_ptr);
job_cnt++;
......
......@@ -36,6 +36,10 @@
#include "src/slurmctld/agent.h"
#include "src/slurmctld/slurmctld.h"
/* Launch the srun request. Note that retry is always zero since
* we don't want to clog the system up with messages destined for
* defunct srun processes
*/
static void _srun_agent_launch(slurm_addr *addr, char *host,
slurm_msg_type_t type, void *msg_args)
{
......@@ -51,6 +55,44 @@ static void _srun_agent_launch(slurm_addr *addr, char *host,
agent_queue_request(agent_args);
}
/*
 * srun_allocate - notify srun of a resource allocation
 * IN job_id - id of the job allocated resource
 *
 * Builds a RESPONSE_RESOURCE_ALLOCATION message containing copies of the
 * job's node list, CPU group arrays and node addresses, then passes it to
 * _srun_agent_launch() addressed to the host/port stored in the job record.
 * Nothing is sent if the job is unknown or has no port or host recorded.
 * The message is released by the agent's purge logic via
 * slurm_free_resource_allocation_response_msg().
 */
extern void srun_allocate (uint32_t job_id)
{
	struct job_record *job_ptr = find_job_record (job_id);
	slurm_addr *addr;
	resource_allocation_response_msg_t *msg_arg;
	size_t cpu_bytes, addr_bytes;

	xassert(job_ptr);
	/* Guard explicitly in case xassert() is a no-op in this build */
	if (!job_ptr)
		return;

	/* No srun is waiting on this job: nothing to notify */
	if (!job_ptr->port || !job_ptr->host || !job_ptr->host[0])
		return;

	/* Size the buffer from the pointer's own type, not a sibling type */
	addr = xmalloc(sizeof(*addr));
	slurm_set_addr(addr, job_ptr->port, job_ptr->host);

	/* Deep-copy everything: the message outlives this call */
	msg_arg = xmalloc(sizeof(resource_allocation_response_msg_t));
	msg_arg->job_id         = job_ptr->job_id;
	msg_arg->node_list      = xstrdup(job_ptr->nodes);

	cpu_bytes = sizeof(uint32_t) * job_ptr->num_cpu_groups;
	msg_arg->num_cpu_groups = job_ptr->num_cpu_groups;
	msg_arg->cpus_per_node  = xmalloc(cpu_bytes);
	memcpy(msg_arg->cpus_per_node, job_ptr->cpus_per_node, cpu_bytes);
	msg_arg->cpu_count_reps = xmalloc(cpu_bytes);
	memcpy(msg_arg->cpu_count_reps, job_ptr->cpu_count_reps, cpu_bytes);

	addr_bytes = sizeof(slurm_addr) * job_ptr->node_cnt;
	msg_arg->node_cnt       = job_ptr->node_cnt;
	msg_arg->node_addr      = xmalloc(addr_bytes);
	memcpy(msg_arg->node_addr, job_ptr->node_addr, addr_bytes);

	msg_arg->error_code     = SLURM_SUCCESS;
	_srun_agent_launch(addr, job_ptr->host,
			   RESPONSE_RESOURCE_ALLOCATION, msg_arg);
}
/*
* srun_node_fail - notify srun of a node's failure
* IN job_id - id of job to notify
......@@ -81,7 +123,8 @@ extern void srun_node_fail (uint32_t job_id, char *node_name)
msg_arg->job_id = job_id;
msg_arg->step_id = NO_VAL;
msg_arg->nodelist = xstrdup(node_name);
_srun_agent_launch(addr, job_ptr->host, SRUN_NODE_FAIL, msg_arg);
_srun_agent_launch(addr, job_ptr->host, SRUN_NODE_FAIL,
msg_arg);
}
......@@ -184,7 +227,8 @@ extern void srun_timeout (uint32_t job_id, time_t timeout)
msg_arg->job_id = job_id;
msg_arg->step_id = NO_VAL;
msg_arg->timeout = timeout;
_srun_agent_launch(addr, job_ptr->host, SRUN_TIMEOUT, msg_arg);
_srun_agent_launch(addr, job_ptr->host, SRUN_TIMEOUT,
msg_arg);
}
......@@ -201,7 +245,8 @@ extern void srun_timeout (uint32_t job_id, time_t timeout)
msg_arg->job_id = job_ptr->job_id;
msg_arg->step_id = step_ptr->step_id;
msg_arg->timeout = timeout;
_srun_agent_launch(addr, step_ptr->host, SRUN_TIMEOUT, msg_arg);
_srun_agent_launch(addr, step_ptr->host, SRUN_TIMEOUT,
msg_arg);
}
list_iterator_destroy(step_record_iterator);
}
......@@ -225,3 +270,4 @@ extern void srun_response(uint32_t job_id, uint32_t step_id)
((step_ptr = find_step_record(job_ptr, (uint16_t) step_id))))
step_ptr->time_last_active = now;
}
......@@ -30,6 +30,12 @@
#include <sys/types.h>
#include <time.h>
/*
 * srun_allocate - notify srun of a resource allocation
 * IN job_id - id of the job allocated resource
 * NOTE: a RESPONSE_RESOURCE_ALLOCATION message is queued only when the
 *	job record has a non-zero port and a non-empty host name
 */
extern void srun_allocate (uint32_t job_id);
/*
* srun_node_fail - notify srun of a node's failure
* IN job_id - id of job to notify
......
......@@ -524,6 +524,11 @@ _handle_msg(job_t *job, slurm_msg_t *msg)
slurm_send_rc_msg(msg, SLURM_SUCCESS);
slurm_free_srun_node_fail_msg(msg->data);
break;
case RESPONSE_RESOURCE_ALLOCATION:
debug3("resource allocation response received");
slurm_send_rc_msg(msg, SLURM_SUCCESS);
slurm_free_resource_allocation_response_msg(msg->data);
break;
default:
error("received spurious message type: %d\n",
msg->msg_type);
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment