diff --git a/doc/html/taskplugins.shtml b/doc/html/taskplugins.shtml index 4599bb8beb1fa602fa3875f1f535ab5e22992197..bcc3d0f1eb424426d32d4e7664ae2c62c4a01370 100644 --- a/doc/html/taskplugins.shtml +++ b/doc/html/taskplugins.shtml @@ -84,6 +84,28 @@ ID of the node on which the resources are being acquired On failure, the plugin should return SLURM_ERROR and set the errno to an appropriate value to indicate the reason for failure.</p> +<p class="commandline">int task_slurmd_suspend_job (uint32_t job_id);</p> +<p style="margin-left:.2in"><b>Description</b>: Temporarily release resources +previously reserved for a job. +Executed by the <b>slurmd</b> daemon as user root.</p> +<p style="margin-left:.2in"><b>Arguments</b>: +<span class="commandline">job_id</span> (input) +ID of the job which is being suspended.</p> +<p style="margin-left:.2in"><b>Returns</b>: SLURM_SUCCESS if successful. +On failure, the plugin should return SLURM_ERROR and set the errno to an +appropriate value to indicate the reason for failure.</p> + +<p class="commandline">int task_slurmd_resume_job (uint32_t job_id);</p> +<p style="margin-left:.2in"><b>Description</b>: Reclaim resources which +were previously released using the task_slurmd_suspend_job function. +Executed by the <b>slurmd</b> daemon as user root.</p> +<p style="margin-left:.2in"><b>Arguments</b>: +<span class="commandline">job_id</span> (input) +ID of the job which is being resumed.</p> +<p style="margin-left:.2in"><b>Returns</b>: SLURM_SUCCESS if successful. +On failure, the plugin should return SLURM_ERROR and set the errno to an +appropriate value to indicate the reason for failure.</p> + <p class="commandline">int task_slurmd_release_resources (uint32_t job_id);</p> <p style="margin-left:.2in"><b>Description</b>: Release resources previously reserved for a job. Executed by the <b>slurmd</b> daemon as user root.</p> @@ -142,6 +164,6 @@ appropriate value to indicate the reason for failure.</p> Future releases of SLURM may revise this API.</p> <p class="footer"><a href="#top">top</a></p> -<p style="text-align:center;">Last modified 27 March 2007</p> +<p style="text-align:center;">Last modified 28 May 2008</p> <!--#include virtual="footer.txt"--> diff --git a/src/plugins/task/affinity/dist_tasks.c b/src/plugins/task/affinity/dist_tasks.c index 03d86c0873cd0fa4cfd23ec75942f0808d926723..9a725dc1afcfde72ffe89d2a1dea3a7e9f0600d0 100644 --- a/src/plugins/task/affinity/dist_tasks.c +++ b/src/plugins/task/affinity/dist_tasks.c @@ -35,13 +35,12 @@ #include "src/plugins/task/affinity/dist_tasks.h" static slurm_lllp_ctx_t *lllp_ctx = NULL; /* binding context */ -static struct node_gids *lllp_tasks = NULL; /* Keep track of the task count for - * logical processors - * socket/core/thread. - */ -static uint32_t lllp_reserved_size = 0;/* size of lllp reserved array */ -static uint32_t *lllp_reserved = NULL; /* count of Reserved lllps (socket, - * core, threads) */ +static struct node_gids *lllp_tasks = NULL; /* Keep track of the task count + * for logical processors + * socket/core/thread. */ +static uint32_t lllp_reserved_size = 0; /* lllp reserved array size */ +static uint32_t *lllp_reserved = NULL; /* count of Reserved lllps + * (socket, core, threads) */ static void _task_layout_display_masks(launch_tasks_request_msg_t *req, diff --git a/src/plugins/task/affinity/task_affinity.c b/src/plugins/task/affinity/task_affinity.c index 054c753956d24f57a4077c81f98baf6951b1310b..466622d180b4616f9dd98ba350a4844b77318d1a 100644 --- a/src/plugins/task/affinity/task_affinity.c +++ b/src/plugins/task/affinity/task_affinity.c @@ -2,11 +2,11 @@ * task_affinity.c - Library for task pre-launch and post_termination * functions for task affinity support ***************************************************************************** - * Copyright (C) 2005 Hewlett-Packard Development Company, L.P. + * Copyright (C) 2005-2008 Hewlett-Packard Development Company, L.P. * Modified by Hewlett-Packard for task affinity support using task_none.c - * Copyright (C) 2005 The Regents of the University of California and + * Copyright (C) 2005-2007 The Regents of the University of California + * Copyright (C) 2008 Lawrence Livermore National Security. * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). - * task_none.c Written by Morris Jette <jette1@llnl.gov>. * LLNL-CODE-402394. * * This file is part of SLURM, a resource management program. @@ -84,7 +84,7 @@ const uint32_t plugin_version = 100; * init() is called when the plugin is loaded, before any other functions * are called. Put global initialization here. */ -int init ( void ) +extern int init (void) { lllp_ctx_alloc(); verbose("%s loaded", plugin_name); @@ -95,7 +95,7 @@ int init ( void ) * fini() is called when the plugin is removed. Clear any allocated * storage here. */ -int fini ( void ) +extern int fini (void) { lllp_ctx_destroy(); verbose("%s unloaded", plugin_name); @@ -195,8 +195,9 @@ static void _update_bind_type(launch_tasks_request_msg_t *req) /* * task_slurmd_launch_request() */ -int task_slurmd_launch_request (uint32_t job_id, - launch_tasks_request_msg_t *req, uint32_t node_id) +extern int task_slurmd_launch_request (uint32_t job_id, + launch_tasks_request_msg_t *req, + uint32_t node_id) { int hw_sockets, hw_cores, hw_threads; char buf_type[100]; @@ -230,18 +231,37 @@ int task_slurmd_launch_request (uint32_t job_id, /* * task_slurmd_reserve_resources() */ -int task_slurmd_reserve_resources ( uint32_t job_id, - launch_tasks_request_msg_t *req, uint32_t node_id) +extern int task_slurmd_reserve_resources (uint32_t job_id, + launch_tasks_request_msg_t *req, + uint32_t node_id) { debug("task_slurmd_reserve_resources: %u", job_id); cr_reserve_lllp(job_id, req, node_id); return SLURM_SUCCESS; } +/* + * task_slurmd_suspend_job() + */ +extern int task_slurmd_suspend_job (uint32_t job_id) +{ + debug("task_slurmd_suspend_job: %u", job_id); + return SLURM_SUCCESS; +} + +/* + * task_slurmd_resume_job() + */ +extern int task_slurmd_resume_job (uint32_t job_id) +{ + debug("task_slurmd_resume_job: %u", job_id); + return SLURM_SUCCESS; +} + /* * task_slurmd_release_resources() */ -int task_slurmd_release_resources ( uint32_t job_id ) +extern int task_slurmd_release_resources (uint32_t job_id) { debug("task_slurmd_release_resources: %u", job_id); cr_release_lllp(job_id); @@ -253,7 +273,7 @@ int task_slurmd_release_resources ( uint32_t job_id ) * user to launch his jobs. Use this to create the CPUSET directory * and set the owner appropriately. */ -int task_pre_setuid ( slurmd_job_t *job ) +extern int task_pre_setuid (slurmd_job_t *job) { char path[PATH_MAX]; @@ -274,7 +294,7 @@ int task_pre_setuid ( slurmd_job_t *job ) * It is followed by TaskProlog program (from slurm.conf) and * --task-prolog (from srun command line). */ -int task_pre_launch ( slurmd_job_t *job ) +extern int task_pre_launch (slurmd_job_t *job) { char base[PATH_MAX], path[PATH_MAX]; @@ -358,7 +378,7 @@ int task_pre_launch ( slurmd_job_t *job ) * It is preceeded by --task-epilog (from srun command line) * followed by TaskEpilog program (from slurm.conf). */ -int task_post_term ( slurmd_job_t *job ) +extern int task_post_term (slurmd_job_t *job) { debug("affinity task_post_term: %u.%u, task %d", job->jobid, job->stepid, job->envtp->procid); diff --git a/src/plugins/task/none/task_none.c b/src/plugins/task/none/task_none.c index 177fce59ac02a225458e21e941654e775812652c..5e852eab6596b349b6eebdf1efb47719b34bb228 100644 --- a/src/plugins/task/none/task_none.c +++ b/src/plugins/task/none/task_none.c @@ -2,7 +2,8 @@ * task_none.c - Library for task pre-launch and post_termination functions * with no actions ***************************************************************************** - * Copyright (C) 2005 The Regents of the University of California. + * Copyright (C) 2005-2007 The Regents of the University of California. + * Copyright (C) 2008 Lawrence Livermore National Security. * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). * Written by Morris Jette <jette1@llnl.gov> * LLNL-CODE-402394. @@ -82,7 +83,7 @@ const uint32_t plugin_version = 100; * init() is called when the plugin is loaded, before any other functions * are called. Put global initialization here. */ -int init ( void ) +extern int init (void) { verbose("%s loaded", plugin_name); return SLURM_SUCCESS; @@ -92,7 +93,7 @@ int init ( void ) * fini() is called when the plugin is removed. Clear any allocated * storage here. */ -int fini ( void ) +extern int fini (void) { return SLURM_SUCCESS; } @@ -100,30 +101,49 @@ int fini ( void ) /* * task_slurmd_launch_request() */ -int task_slurmd_launch_request ( uint32_t job_id, launch_tasks_request_msg_t *req, uint32_t node_id) +extern int task_slurmd_launch_request (uint32_t job_id, + launch_tasks_request_msg_t *req, + uint32_t node_id) { - debug("task_slurmd_launch_request: %u %u", - job_id, node_id); + debug("task_slurmd_launch_request: %u %u", job_id, node_id); return SLURM_SUCCESS; } /* * task_slurmd_reserve_resources() */ -int task_slurmd_reserve_resources ( uint32_t job_id, launch_tasks_request_msg_t *req, uint32_t node_id ) +extern int task_slurmd_reserve_resources (uint32_t job_id, + launch_tasks_request_msg_t *req, + uint32_t node_id) { - debug("task_slurmd_reserve_resources: %u %u", - job_id, node_id); + debug("task_slurmd_reserve_resources: %u %u", job_id, node_id); + return SLURM_SUCCESS; +} + +/* + * task_slurmd_suspend_job() + */ +extern int task_slurmd_suspend_job (uint32_t job_id) +{ + debug("task_slurmd_suspend_job: %u", job_id); + return SLURM_SUCCESS; +} + +/* + * task_slurmd_resume_job() + */ +extern int task_slurmd_resume_job (uint32_t job_id) +{ + debug("task_slurmd_resume_job: %u", job_id); return SLURM_SUCCESS; } /* * task_slurmd_release_resources() */ -int task_slurmd_release_resources ( uint32_t job_id ) +extern int task_slurmd_release_resources (uint32_t job_id) { - debug("task_slurmd_release_resources: %u", - job_id); + debug("task_slurmd_release_resources: %u", job_id); return SLURM_SUCCESS; } @@ -132,7 +152,7 @@ int task_slurmd_release_resources ( uint32_t job_id ) * user to launch his jobs. Use this to create the CPUSET directory * and set the owner appropriately. */ -int task_pre_setuid ( slurmd_job_t *job ) +extern int task_pre_setuid (slurmd_job_t *job) { return SLURM_SUCCESS; } @@ -142,7 +162,7 @@ int task_pre_setuid ( slurmd_job_t *job ) * It is followed by TaskProlog program (from slurm.conf) and * --task-prolog (from srun command line). */ -int task_pre_launch ( slurmd_job_t *job ) +extern int task_pre_launch (slurmd_job_t *job) { debug("task_pre_launch: %u.%u, task %d", job->jobid, job->stepid, job->envtp->procid); @@ -154,7 +174,7 @@ int task_pre_launch ( slurmd_job_t *job ) * It is preceeded by --task-epilog (from srun command line) * followed by TaskEpilog program (from slurm.conf). */ -int task_post_term ( slurmd_job_t *job ) +extern int task_post_term (slurmd_job_t *job) { debug("task_post_term: %u.%u, task %d", job->jobid, job->stepid, job->envtp->procid); diff --git a/src/slurmd/common/task_plugin.c b/src/slurmd/common/task_plugin.c index 1a7505c017507dbfd76aa90209ffb1baba045054..1882a9b0555a1044a77eff929214fbcdb7330bc2 100644 --- a/src/slurmd/common/task_plugin.c +++ b/src/slurmd/common/task_plugin.c @@ -1,7 +1,8 @@ /*****************************************************************************\ * task_plugin.h - task launch plugin stub. ***************************************************************************** - * Copyright (C) 2005 The Regents of the University of California. + * Copyright (C) 2005-2007 The Regents of the University of California. + * Copyright (C) 2008 Lawrence Livermore National Security. * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). * Written by Morris Jette <jette1@llnl.gov> * LLNL-CODE-402394. @@ -46,13 +47,19 @@ #include "src/slurmd/slurmstepd/slurmstepd_job.h" typedef struct slurmd_task_ops { - int (*slurmd_launch_request) ( uint32_t job_id, launch_tasks_request_msg_t *req, uint32_t node_id); - int (*slurmd_reserve_resources) ( uint32_t job_id, launch_tasks_request_msg_t *req, uint32_t node_id ); - int (*slurmd_release_resources) ( uint32_t job_id); - - int (*pre_setuid) ( slurmd_job_t *job ); - int (*pre_launch) ( slurmd_job_t *job ); - int (*post_term) ( slurmd_job_t *job ); + int (*slurmd_launch_request) (uint32_t job_id, + launch_tasks_request_msg_t *req, + uint32_t node_id); + int (*slurmd_reserve_resources) (uint32_t job_id, + launch_tasks_request_msg_t *req, + uint32_t node_id); + int (*slurmd_suspend_job) (uint32_t job_id); + int (*slurmd_resume_job) (uint32_t job_id); + int (*slurmd_release_resources) (uint32_t job_id); + + int (*pre_setuid) (slurmd_job_t *job); + int (*pre_launch) (slurmd_job_t *job); + int (*post_term) (slurmd_job_t *job); } slurmd_task_ops_t; @@ -68,7 +75,7 @@ static pthread_mutex_t g_task_context_lock = PTHREAD_MUTEX_INITIALIZER; static slurmd_task_ops_t * -_slurmd_task_get_ops( slurmd_task_context_t *c ) +_slurmd_task_get_ops(slurmd_task_context_t *c) { /* * Must be synchronized with slurmd_task_ops_t above. @@ -76,6 +83,8 @@ _slurmd_task_get_ops( slurmd_task_context_t *c ) static const char *syms[] = { "task_slurmd_launch_request", "task_slurmd_reserve_resources", + "task_slurmd_suspend_job", + "task_slurmd_resume_job", "task_slurmd_release_resources", "task_pre_setuid", "task_pre_launch", @@ -118,7 +127,7 @@ _slurmd_task_get_ops( slurmd_task_context_t *c ) static slurmd_task_context_t * -_slurmd_task_context_create( const char *task_plugin_type ) +_slurmd_task_context_create(const char *task_plugin_type) { slurmd_task_context_t *c; @@ -137,7 +146,7 @@ _slurmd_task_context_create( const char *task_plugin_type ) static int -_slurmd_task_context_destroy( slurmd_task_context_t *c ) +_slurmd_task_context_destroy(slurmd_task_context_t *c) { /* * Must check return code here because plugins might still @@ -161,7 +170,7 @@ _slurmd_task_context_destroy( slurmd_task_context_t *c ) * * RET - slurm error code */ -extern int slurmd_task_init( void ) +extern int slurmd_task_init(void) { int retval = SLURM_SUCCESS; char *task_plugin_type = NULL; @@ -198,7 +207,7 @@ extern int slurmd_task_init( void ) * * RET - slurm error code */ -extern int slurmd_task_fini( void ) +extern int slurmd_task_fini(void) { int rc; @@ -215,9 +224,11 @@ extern int slurmd_task_fini( void ) * * RET - slurm error code */ -extern int slurmd_launch_request( uint32_t job_id, launch_tasks_request_msg_t *req, uint32_t node_id) +extern int slurmd_launch_request(uint32_t job_id, + launch_tasks_request_msg_t *req, + uint32_t node_id) { - if ( slurmd_task_init() ) + if (slurmd_task_init()) return SLURM_ERROR; return (*(g_task_context->ops.slurmd_launch_request))(job_id, req, node_id); @@ -228,22 +239,50 @@ extern int slurmd_launch_request( uint32_t job_id, launch_tasks_request_msg_t *r * * RET - slurm error code */ -extern int slurmd_reserve_resources( uint32_t job_id, launch_tasks_request_msg_t *req, uint32_t node_id ) +extern int slurmd_reserve_resources(uint32_t job_id, + launch_tasks_request_msg_t *req, + uint32_t node_id ) { - if ( slurmd_task_init() ) + if (slurmd_task_init()) return SLURM_ERROR; return (*(g_task_context->ops.slurmd_reserve_resources))(job_id, req, node_id); } +/* + * Slurmd is suspending a job. + * + * RET - slurm error code + */ +extern int slurmd_suspend_job(uint32_t job_id) +{ + if (slurmd_task_init()) + return SLURM_ERROR; + + return (*(g_task_context->ops.slurmd_suspend_job))(job_id); +} + +/* + * Slurmd is resuming a previously suspended job. + * + * RET - slurm error code + */ +extern int slurmd_resume_job(uint32_t job_id) +{ + if (slurmd_task_init()) + return SLURM_ERROR; + + return (*(g_task_context->ops.slurmd_resume_job))(job_id); +} + /* * Slurmd is releasing resources for the task. * * RET - slurm error code */ -extern int slurmd_release_resources( uint32_t job_id ) +extern int slurmd_release_resources(uint32_t job_id) { - if ( slurmd_task_init() ) + if (slurmd_task_init()) return SLURM_ERROR; return (*(g_task_context->ops.slurmd_release_resources))(job_id); @@ -255,9 +294,9 @@ extern int slurmd_release_resources( uint32_t job_id ) * * RET - slurm error code */ -extern int pre_setuid( slurmd_job_t *job ) +extern int pre_setuid(slurmd_job_t *job) { - if ( slurmd_task_init() ) + if (slurmd_task_init()) return SLURM_ERROR; return (*(g_task_context->ops.pre_setuid))(job); @@ -268,9 +307,9 @@ extern int pre_setuid( slurmd_job_t *job ) * * RET - slurm error code */ -extern int pre_launch( slurmd_job_t *job ) +extern int pre_launch(slurmd_job_t *job) { - if ( slurmd_task_init() ) + if (slurmd_task_init()) return SLURM_ERROR; return (*(g_task_context->ops.pre_launch))(job); @@ -281,9 +320,9 @@ extern int pre_launch( slurmd_job_t *job ) * * RET - slurm error code */ -extern int post_term( slurmd_job_t *job ) +extern int post_term(slurmd_job_t *job) { - if ( slurmd_task_init() ) + if (slurmd_task_init()) return SLURM_ERROR; return (*(g_task_context->ops.post_term))(job); diff --git a/src/slurmd/common/task_plugin.h b/src/slurmd/common/task_plugin.h index 4cca516ac64fc08cc2e0edbf0ad1689285ebd39e..121876569de2e577f7b4507ec9fd0ce4b82e6529 100644 --- a/src/slurmd/common/task_plugin.h +++ b/src/slurmd/common/task_plugin.h @@ -1,7 +1,8 @@ /*****************************************************************************\ * task_plugin.h - Define plugin functions for task pre_launch and post_term. ***************************************************************************** - * Copyright (C) 2005 The Regents of the University of California. + * Copyright (C) 2005-2007 The Regents of the University of California. + * Copyright (C) 2008 Lawrence Livermore National Security. * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). * Written by Morris Jette <jette1@llnl.gov> * LLNL-CODE-402394. @@ -65,21 +66,39 @@ extern int slurmd_task_fini(void); * * RET - slurm error code */ -extern int slurmd_launch_request( uint32_t job_id, launch_tasks_request_msg_t *req, uint32_t node_id ); +extern int slurmd_launch_request(uint32_t job_id, + launch_tasks_request_msg_t *req, + uint32_t node_id ); /* * Slurmd is reserving resources for the task. * * RET - slurm error code */ -extern int slurmd_reserve_resources( uint32_t job_id, launch_tasks_request_msg_t *req, uint32_t node_id ); +extern int slurmd_reserve_resources(uint32_t job_id, + launch_tasks_request_msg_t *req, + uint32_t node_id ); + +/* + * Slurmd is suspending a job. + * + * RET - slurm error code + */ +extern int slurmd_suspend_job(uint32_t job_id); + +/* + * Slurmd is resuming a previously suspended job. + * + * RET - slurm error code + */ +extern int slurmd_resume_job(uint32_t job_id); /* * Slurmd is releasing resources for the task. * * RET - slurm error code */ -extern int slurmd_release_resources( uint32_t job_id ); +extern int slurmd_release_resources(uint32_t job_id); /* * Note that a task launch is about to occur. diff --git a/src/slurmd/slurmd/req.c b/src/slurmd/slurmd/req.c index c3c70a372dd1bcd6bdf105d2dceaf19dba2ca11e..6852e43f6db1ab7131c032a175a738aee85dda8d 100644 --- a/src/slurmd/slurmd/req.c +++ b/src/slurmd/slurmd/req.c @@ -2390,6 +2390,12 @@ _rpc_suspend_job(slurm_msg_t *msg) sleep(1); } + /* Release or reclaim resources bound to these tasks (task affinity) */ + if (req->op == SUSPEND_JOB) + (void) slurmd_suspend_job(req->job_id); + else + (void) slurmd_resume_job(req->job_id); + /* * Loop through all job steps and call stepd_suspend or stepd_resume * as appropriate. Since the "suspend" action contains a 'sleep 1',