Skip to content
Snippets Groups Projects
Commit a3104387 authored by Moe Jette's avatar Moe Jette
Browse files

Move plugin-specific error numbers and matching strings from common into

the plugins themselves. Add new functions to each plugin that return the
plugin's error number and, given an error number, return its description.
parent 7da16150
No related branches found
No related tags found
No related merge requests found
Showing
with 375 additions and 92 deletions
...@@ -16,6 +16,10 @@ documents those changes that are of interest to users and admins. ...@@ -16,6 +16,10 @@ documents those changes that are of interest to users and admins.
-- Return errors when trying to change configuration parameters -- Return errors when trying to change configuration parameters
AuthType, SchedulerType, and SwitchType via "scontrol reconfig" AuthType, SchedulerType, and SwitchType via "scontrol reconfig"
or SIGHUP. Document how to safely change these parameters. or SIGHUP. Document how to safely change these parameters.
-- Plugin-specific error number definitions and descriptive strings
moved from common into plugin modules
-- Documentation for writing scheduler, switch, and job completion
logging plugins added
* Changes in SLURM 0.3.0.0-pre6 * Changes in SLURM 0.3.0.0-pre6
=============================== ===============================
......
...@@ -93,9 +93,9 @@ integer values should be used when appropriate. It is desirable that these value ...@@ -93,9 +93,9 @@ integer values should be used when appropriate. It is desirable that these value
be mapped into the range ESLURM_SWITCH_MIN and ESLURM_SWITCH_MAX be mapped into the range ESLURM_SWITCH_MIN and ESLURM_SWITCH_MAX
as defined in <span class="commandline">slurm/slurm_errno.h</span>. as defined in <span class="commandline">slurm/slurm_errno.h</span>.
The error number should be returned by the function The error number should be returned by the function
<a href="#get_errno"><span class="commandline">switch_get_errno()</span></a> <a href="#get_errno"><span class="commandline">switch_p_get_errno()</span></a>
and this error number can be converted to an appropriate string description using the and this error number can be converted to an appropriate string description using the
<a href="#strerror"><span class="commandline">switch_strerror()</span></a> <a href="#strerror"><span class="commandline">switch_p_strerror()</span></a>
function described below.</p> function described below.</p>
<p>These values must not be used as return values in integer-valued functions <p>These values must not be used as return values in integer-valued functions
...@@ -353,14 +353,14 @@ to indicate the reason for failure.</p> ...@@ -353,14 +353,14 @@ to indicate the reason for failure.</p>
<p class="footer"><a href="#top">top</a></p> <p class="footer"><a href="#top">top</a></p>
<h4>Error Handling Functions</h4> <h4>Error Handling Functions</h4>
<a name="get_errno"><p class="commandline">int switch_get_errno (void);</p></a> <a name="get_errno"><p class="commandline">int switch_p_get_errno (void);</p></a>
<p style="margin-left:.2in"><b>Description</b>: Return the number of a switch <p style="margin-left:.2in"><b>Description</b>: Return the number of a switch
specific error.</p> specific error.</p>
<p style="margin-left:.2in"><b>Arguments</b>: None</p> <p style="margin-left:.2in"><b>Arguments</b>: None</p>
<p style="margin-left:.2in"><b>Returns</b>: Error number for the last failure encountered by <p style="margin-left:.2in"><b>Returns</b>: Error number for the last failure encountered by
the switch plugin.</p> the switch plugin.</p>
<p class="commandline"><a name="strerror">char *switch_strerror(int errnum);</a></p> <p class="commandline"><a name="strerror">char *switch_p_strerror(int errnum);</a></p>
<p style="margin-left:.2in"><b>Description</b>: Return a string description of a switch <p style="margin-left:.2in"><b>Description</b>: Return a string description of a switch
specific error code.</p> specific error code.</p>
<p style="margin-left:.2in"><b>Arguments</b>: <p style="margin-left:.2in"><b>Arguments</b>:
......
...@@ -128,25 +128,14 @@ enum { ...@@ -128,25 +128,14 @@ enum {
ESLURM_INVALID_SCHEDTYPE_CHANGE, ESLURM_INVALID_SCHEDTYPE_CHANGE,
ESLURM_INVALID_SWITCHTYPE_CHANGE, ESLURM_INVALID_SWITCHTYPE_CHANGE,
/* Quadrics Elan routine error codes */ /* switch specific error codes, specific values defined in plugin module */
ENOSLURM = 3000, ESLURM_SWITCH_MIN = 3000,
EBADMAGIC_QSWLIBSTATE, ESLURM_SWITCH_MAX = 3099,
EBADMAGIC_QSWJOBINFO, ESLURM_JOBCOMP_MIN = 3100,
EINVAL_PRGCREATE, ESLURM_JOBCOMP_MAX = 3199,
ECHILD_PRGDESTROY, ESLURM_SCHED_MIN = 3200,
EEXIST_PRGDESTROY, ESLURM_SCHED_MAX = 3299,
EELAN3INIT, /* reserved for other plugin specific error codes up to 3999 */
EELAN3CONTROL,
EELAN3CREATE,
ESRCH_PRGADDCAP,
EFAULT_PRGADDCAP,
EINVAL_SETCAP,
EFAULT_SETCAP,
EGETNODEID,
EGETNODEID_BYHOST,
EGETHOST_BYNODEID,
ESRCH_PRGSIGNAL,
EINVAL_PRGSIGNAL,
/* slurmd error codes */ /* slurmd error codes */
ESLRUMD_PIPE_ERROR_ON_TASK_SPAWN = 4000, ESLRUMD_PIPE_ERROR_ON_TASK_SPAWN = 4000,
......
...@@ -42,13 +42,16 @@ ...@@ -42,13 +42,16 @@
#include <slurm/slurm_errno.h> #include <slurm/slurm_errno.h>
#include "src/common/slurm_jobcomp.h"
#include "src/common/switch.h"
/* Type for error string table entries */ /* Type for error string table entries */
typedef struct { typedef struct {
int xe_number; int xe_number;
char *xe_message; char *xe_message;
} slurm_errtab_t; } slurm_errtab_t;
/* Add new error values to xerrno.h, and their descriptions to this table */ /* Add new error values to slurm/slurm_errno.h, and their descriptions to this table */
static slurm_errtab_t slurm_errtab[] = { static slurm_errtab_t slurm_errtab[] = {
{0, "No error"}, {0, "No error"},
{-1, "Unspecified error"}, {-1, "Unspecified error"},
...@@ -156,45 +159,6 @@ static slurm_errtab_t slurm_errtab[] = { ...@@ -156,45 +159,6 @@ static slurm_errtab_t slurm_errtab[] = {
{ ESLURM_INVALID_SWITCHTYPE_CHANGE, { ESLURM_INVALID_SWITCHTYPE_CHANGE,
"SwitchType change requires restart of all SLURM daemons and jobs"}, "SwitchType change requires restart of all SLURM daemons and jobs"},
/* Quadrics Elan routine error codes */
{ ENOSLURM, /* oh no! */
"Out of slurm" },
{ EBADMAGIC_QSWLIBSTATE,
"Bad magic in QSW libstate" },
{ EBADMAGIC_QSWJOBINFO,
"Bad magic in QSW jobinfo" },
{ EINVAL_PRGCREATE,
"Program identifier in use or CPU count invalid, try again" },
{ ECHILD_PRGDESTROY,
"Processes belonging to this program are still running" },
{ EEXIST_PRGDESTROY,
"Program identifier does not exist" },
{ EELAN3INIT,
"Too many processes using Elan or mapping failure" },
{ EELAN3CONTROL,
"Could not open elan3 control device" },
{ EELAN3CREATE,
"Could not create elan capability" },
{ ESRCH_PRGADDCAP,
"Program does not exist (addcap)" },
{ EFAULT_PRGADDCAP,
"Capability has invalid address (addcap)" },
{ EINVAL_SETCAP,
"Invalid context number (setcap)" },
{ EFAULT_SETCAP,
"Capability has invalid address (setcap)" },
{ EGETNODEID,
"Cannot determine local elan address" },
{ EGETNODEID_BYHOST,
"Cannot translate hostname to elan address" },
{ EGETHOST_BYNODEID,
"Cannot translate elan address to hostname" },
{ ESRCH_PRGSIGNAL,
"No such program identifier" },
{ EINVAL_PRGSIGNAL,
"Invalid signal number" },
/* slurmd error codes */ /* slurmd error codes */
{ ESLRUMD_PIPE_ERROR_ON_TASK_SPAWN, { ESLRUMD_PIPE_ERROR_ON_TASK_SPAWN,
...@@ -297,6 +261,25 @@ static char *_lookup_slurm_api_errtab(int errnum) ...@@ -297,6 +261,25 @@ static char *_lookup_slurm_api_errtab(int errnum)
break; break;
} }
} }
if ((res == NULL) &&
(errnum >= ESLURM_JOBCOMP_MIN) &&
(errnum <= ESLURM_JOBCOMP_MAX))
res = g_slurm_jobcomp_strerror(errnum);
#if 0
/* If needed, re-locate slurmctld/sched_plugin.[ch] into common */
if ((res == NULL) &&
(errnum >= ESLURM_SCHED_MIN) &&
(errnum <= ESLURM_SCHED_MAX))
res = sched_strerror(errnum);
#endif
if ((res == NULL) &&
(errnum >= ESLURM_SWITCH_MIN) &&
(errnum <= ESLURM_SWITCH_MAX))
res = switch_strerror(errnum);
return res; return res;
} }
......
...@@ -50,8 +50,10 @@ typedef struct slurm_jobcomp_ops { ...@@ -50,8 +50,10 @@ typedef struct slurm_jobcomp_ops {
char *job_state, char *partition, uint32_t time_limit, char *job_state, char *partition, uint32_t time_limit,
time_t start_time, time_t end_time, char *node_list); time_t start_time, time_t end_time, char *node_list);
int (*sa_errno) ( void ); int (*sa_errno) ( void );
char * (*job_strerror) ( int errno );
} slurm_jobcomp_ops_t; } slurm_jobcomp_ops_t;
/* /*
* A global job completion context. "Global" in the sense that there's * A global job completion context. "Global" in the sense that there's
* only one, with static bindings. We don't export it. * only one, with static bindings. We don't export it.
...@@ -129,7 +131,8 @@ _slurm_jobcomp_get_ops( slurm_jobcomp_context_t c ) ...@@ -129,7 +131,8 @@ _slurm_jobcomp_get_ops( slurm_jobcomp_context_t c )
static const char *syms[] = { static const char *syms[] = {
"slurm_jobcomp_set_location", "slurm_jobcomp_set_location",
"slurm_jobcomp_log_record", "slurm_jobcomp_log_record",
"slurm_jobcomp_get_errno" "slurm_jobcomp_get_errno",
"slurm_jobcomp_strerror"
}; };
int n_syms = sizeof( syms ) / sizeof( char * ); int n_syms = sizeof( syms ) / sizeof( char * );
...@@ -241,3 +244,17 @@ g_slurm_jobcomp_errno(void) ...@@ -241,3 +244,17 @@ g_slurm_jobcomp_errno(void)
slurm_mutex_unlock( &context_lock ); slurm_mutex_unlock( &context_lock );
return retval; return retval;
} }
/*
 * g_slurm_jobcomp_strerror - ask the loaded job completion plugin for the
 *	description of one of its error codes
 * IN errnum - job completion logger specific error code
 * RET the string returned by the plugin's job_strerror operation, or NULL
 *	(after logging an error) if no plugin context exists
 * NOTE: the plugin call is made while holding context_lock
 */
extern char *
g_slurm_jobcomp_strerror(int errnum)
{
	char *msg = NULL;

	slurm_mutex_lock( &context_lock );
	if ( !g_context )
		error ("slurm_jobcomp plugin context not initialized");
	else
		msg = (*(g_context->ops.job_strerror))(errnum);
	slurm_mutex_unlock( &context_lock );

	return msg;
}
...@@ -51,4 +51,8 @@ extern int g_slurm_jobcomp_write(uint32_t job_id, uint32_t user_id, char *job_na ...@@ -51,4 +51,8 @@ extern int g_slurm_jobcomp_write(uint32_t job_id, uint32_t user_id, char *job_na
/* return error code */ /* return error code */
extern int g_slurm_jobcomp_errno(void); extern int g_slurm_jobcomp_errno(void);
/* convert job completion logger specific error code to a string */
extern char *g_slurm_jobcomp_strerror(int errnum);
#endif /*__SLURM_JOBCOMP_H__*/ #endif /*__SLURM_JOBCOMP_H__*/
...@@ -75,6 +75,8 @@ typedef struct slurm_switch_ops { ...@@ -75,6 +75,8 @@ typedef struct slurm_switch_ops {
char ***env, uint32_t nodeid, char ***env, uint32_t nodeid,
uint32_t procid, uint32_t nnodes, uint32_t procid, uint32_t nnodes,
uint32_t nprocs, uint32_t rank); uint32_t nprocs, uint32_t rank);
char * (*switch_strerror) ( int errnum );
int (*switch_errno) ( void );
} slurm_switch_ops_t; } slurm_switch_ops_t;
struct slurm_switch_context { struct slurm_switch_context {
...@@ -164,7 +166,9 @@ _slurm_switch_get_ops( slurm_switch_context_t c ) ...@@ -164,7 +166,9 @@ _slurm_switch_get_ops( slurm_switch_context_t c )
"switch_p_job_init", "switch_p_job_init",
"switch_p_job_fini", "switch_p_job_fini",
"switch_p_job_postfini", "switch_p_job_postfini",
"switch_p_job_attach" "switch_p_job_attach",
"switch_p_strerror",
"switch_p_get_errno"
}; };
int n_syms = sizeof( syms ) / sizeof( char * ); int n_syms = sizeof( syms ) / sizeof( char * );
...@@ -388,3 +392,19 @@ extern int interconnect_attach(switch_jobinfo_t jobinfo, char ***env, ...@@ -388,3 +392,19 @@ extern int interconnect_attach(switch_jobinfo_t jobinfo, char ***env,
return (*(g_context->ops.job_attach)) (jobinfo, env, return (*(g_context->ops.job_attach)) (jobinfo, env,
nodeid, procid, nnodes, nprocs, gid); nodeid, procid, nnodes, nprocs, gid);
} }
/*
 * switch_get_errno - return the switch plugin's error number
 * RET value of the plugin's switch_errno operation, or SLURM_ERROR if
 *	switch_init() reports a negative result
 */
extern int switch_get_errno(void)
{
	if ( switch_init() >= 0 )
		return (*(g_context->ops.switch_errno))( );

	return SLURM_ERROR;
}
/*
 * switch_strerror - return a string description of a switch specific
 *	error code
 * IN errnum - switch specific error return code
 * RET string supplied by the plugin's switch_strerror operation, or NULL
 *	if switch_init() reports a negative result
 *
 * NOTE: the failure path must yield NULL, not SLURM_ERROR. SLURM_ERROR is
 * an integer status code; returned through a "char *" it would become a
 * non-NULL invalid pointer that callers testing for NULL would then treat
 * as a valid string.
 */
extern char *switch_strerror(int errnum)
{
	if ( switch_init() < 0 )
		return NULL;

	return (*(g_context->ops.switch_strerror))( errnum );
}
...@@ -46,7 +46,7 @@ ...@@ -46,7 +46,7 @@
typedef struct slurm_switch_context * slurm_switch_context_t; typedef struct slurm_switch_context * slurm_switch_context_t;
/*****************************************\ /*****************************************\
* GLOBAL SWITCH STATE MANGEMENT FUNCIONS* * GLOBAL SWITCH STATE MANGEMENT FUNCIONS *
\ *****************************************/ \ *****************************************/
/* initialize the switch plugin */ /* initialize the switch plugin */
...@@ -73,6 +73,15 @@ extern int switch_restore(char *dir_name); ...@@ -73,6 +73,15 @@ extern int switch_restore(char *dir_name);
*/ */
extern bool switch_no_frag(void); extern bool switch_no_frag(void);
/* return the number of a switch-specific error code */
extern int switch_get_errno(void);
/* return a string description of a switch specific error code
* IN errnum - switch specific error return code
* RET - string describing the nature of the error
*/
extern char *switch_strerror(int errnum);
/******************************************************\ /******************************************************\
* JOB-SPECIFIC SWITCH CREDENTIAL MANAGEMENT FUNCIONS * * JOB-SPECIFIC SWITCH CREDENTIAL MANAGEMENT FUNCIONS *
\******************************************************/ \******************************************************/
...@@ -135,6 +144,7 @@ extern void switch_print_jobinfo(FILE *fp, switch_jobinfo_t jobinfo); ...@@ -135,6 +144,7 @@ extern void switch_print_jobinfo(FILE *fp, switch_jobinfo_t jobinfo);
*/ */
extern char *switch_sprint_jobinfo( switch_jobinfo_t jobinfo, extern char *switch_sprint_jobinfo( switch_jobinfo_t jobinfo,
char *buf, size_t size); char *buf, size_t size);
/********************************************************************\ /********************************************************************\
* JOB LAUNCH AND MANAGEMENT FUNCTIONS RELATED TO SWITCH CREDENTIAL * * JOB LAUNCH AND MANAGEMENT FUNCTIONS RELATED TO SWITCH CREDENTIAL *
\********************************************************************/ \********************************************************************/
......
...@@ -52,6 +52,17 @@ ...@@ -52,6 +52,17 @@
#define JOB_FORMAT "JobId=%lu UserId=%s(%lu) Name=%s JobState=%s Partition=%s"\ #define JOB_FORMAT "JobId=%lu UserId=%s(%lu) Name=%s JobState=%s Partition=%s"\
" TimeLimit=%s StartTime=%s EndTime=%s NodeList=%s\n" " TimeLimit=%s StartTime=%s EndTime=%s NodeList=%s\n"
/* One row of the error string table: an error number and its description */
typedef struct {
	int   xe_number;	/* error number */
	char *xe_message;	/* text describing xe_number */
} slurm_errtab_t;

/* Error numbers this plugin can describe; searched by xe_number */
static slurm_errtab_t slurm_errtab[] = {
	{  0, "No error" },
	{ -1, "Unspecified error" }
};
/* /*
* These variables are required by the generic plugin interface. If they * These variables are required by the generic plugin interface. If they
...@@ -219,11 +230,35 @@ int slurm_jobcomp_log_record ( uint32_t job_id, uint32_t user_id, ...@@ -219,11 +230,35 @@ int slurm_jobcomp_log_record ( uint32_t job_id, uint32_t user_id,
return rc; return rc;
} }
int slurm_jobcomp_get_errno( void ) extern int slurm_jobcomp_get_errno( void )
{ {
return plugin_errno; return plugin_errno;
} }
/*
* Linear search through table of errno values and strings,
* returns NULL on error, string on success.
*/
static char *_lookup_slurm_api_errtab(int errnum)
{
char *res = NULL;
int i;
for (i = 0; i < sizeof(slurm_errtab) / sizeof(slurm_errtab_t); i++) {
if (slurm_errtab[i].xe_number == errnum) {
res = slurm_errtab[i].xe_message;
break;
}
}
return res;
}
/*
 * slurm_jobcomp_strerror - describe a job completion logger error code
 * IN errnum - error number
 * RET the matching string from this plugin's error table if there is one,
 *	otherwise whatever strerror(errnum) returns
 */
extern char *slurm_jobcomp_strerror( int errnum )
{
	char *msg = _lookup_slurm_api_errtab(errnum);

	if (!msg)
		msg = strerror(errnum);
	return msg;
}
int fini ( void ) int fini ( void )
{ {
if (job_comp_fd >= 0) if (job_comp_fd >= 0)
......
...@@ -104,6 +104,11 @@ int slurm_jobcomp_get_errno( void ) ...@@ -104,6 +104,11 @@ int slurm_jobcomp_get_errno( void )
return SLURM_SUCCESS; return SLURM_SUCCESS;
} }
/*
 * slurm_jobcomp_strerror - this plugin has no error descriptions of its
 *	own, so every lookup yields NULL
 * IN errnum - error number (ignored)
 * RET always NULL
 */
char *
slurm_jobcomp_strerror( int errnum )
{
	(void) errnum;
	return NULL;
}
int fini ( void ) int fini ( void )
{ {
return SLURM_SUCCESS; return SLURM_SUCCESS;
......
...@@ -38,6 +38,9 @@ const char plugin_name[] = "SLURM Backfill Scheduler plugin"; ...@@ -38,6 +38,9 @@ const char plugin_name[] = "SLURM Backfill Scheduler plugin";
const char plugin_type[] = "sched/backfill"; const char plugin_type[] = "sched/backfill";
const uint32_t plugin_version = 90; const uint32_t plugin_version = 90;
/*
 * A plugin-global errno, reported to callers by slurm_sched_get_errno().
 * Starts out as SLURM_SUCCESS.
 * NOTE(review): no assignment to it is visible in this excerpt - confirm
 * whether anything in the plugin ever sets it.
 */
static int plugin_errno = SLURM_SUCCESS;
static pthread_t backfill_thread; static pthread_t backfill_thread;
static bool thread_running = false; static bool thread_running = false;
static pthread_mutex_t thread_flag_mutex = PTHREAD_MUTEX_INITIALIZER; static pthread_mutex_t thread_flag_mutex = PTHREAD_MUTEX_INITIALIZER;
...@@ -111,3 +114,20 @@ void slurm_sched_plugin_job_is_pending( void ) ...@@ -111,3 +114,20 @@ void slurm_sched_plugin_job_is_pending( void )
{ {
run_backfill(); run_backfill();
} }
/**************************************************************************/
/*  TAG(                     slurm_sched_get_errno                      ) */
/*                                                                        */
/*  Report the current value of the plugin-global error number.          */
/**************************************************************************/
int
slurm_sched_get_errno( void )
{
	return plugin_errno;
}
/**************************************************************************/
/*  TAG(                     slurm_sched_strerror                       ) */
/*                                                                        */
/*  This plugin supplies no error descriptions: the result is always     */
/*  NULL, whatever errnum is.                                             */
/**************************************************************************/
char *
slurm_sched_strerror( int errnum )
{
	(void) errnum;
	return NULL;
}
...@@ -34,6 +34,9 @@ const char plugin_name[] = "SLURM Built-in Scheduler plugin"; ...@@ -34,6 +34,9 @@ const char plugin_name[] = "SLURM Built-in Scheduler plugin";
const char plugin_type[] = "sched/builtin"; const char plugin_type[] = "sched/builtin";
const uint32_t plugin_version = 90; const uint32_t plugin_version = 90;
/*
 * A plugin-global errno, reported to callers by slurm_sched_get_errno().
 * Starts out as SLURM_SUCCESS.
 * NOTE(review): no assignment to it is visible in this excerpt - confirm
 * whether anything in the plugin ever sets it.
 */
static int plugin_errno = SLURM_SUCCESS;
/**************************************************************************/ /**************************************************************************/
/* TAG( init ) */ /* TAG( init ) */
/**************************************************************************/ /**************************************************************************/
...@@ -83,3 +86,19 @@ void slurm_sched_plugin_job_is_pending( void ) ...@@ -83,3 +86,19 @@ void slurm_sched_plugin_job_is_pending( void )
/* Empty. */ /* Empty. */
} }
/**************************************************************************/
/*  TAG(                     slurm_sched_get_errno                      ) */
/*                                                                        */
/*  Report the current value of the plugin-global error number.          */
/**************************************************************************/
int
slurm_sched_get_errno( void )
{
	return plugin_errno;
}
/**************************************************************************/
/*  TAG(                     slurm_sched_strerror                       ) */
/*                                                                        */
/*  This plugin supplies no error descriptions: the result is always     */
/*  NULL, whatever errnum is.                                             */
/**************************************************************************/
char *
slurm_sched_strerror( int errnum )
{
	(void) errnum;
	return NULL;
}
...@@ -50,6 +50,9 @@ extern "C" { ...@@ -50,6 +50,9 @@ extern "C" {
uint32_t plugin_version = 90; uint32_t plugin_version = 90;
} }
/*
 * A plugin-global errno, reported to callers by slurm_sched_get_errno().
 * Starts out as SLURM_SUCCESS.
 * NOTE(review): no assignment to it is visible in this excerpt - confirm
 * whether anything in the plugin ever sets it.
 */
static int plugin_errno = SLURM_SUCCESS;
static pthread_t receptionist_thread; static pthread_t receptionist_thread;
static bool thread_running = false; static bool thread_running = false;
static pthread_mutex_t thread_flag_mutex = PTHREAD_MUTEX_INITIALIZER; static pthread_mutex_t thread_flag_mutex = PTHREAD_MUTEX_INITIALIZER;
...@@ -182,3 +185,21 @@ extern "C" void slurm_sched_plugin_job_is_pending( void ) ...@@ -182,3 +185,21 @@ extern "C" void slurm_sched_plugin_job_is_pending( void )
// Wiki does not respond to pending job // Wiki does not respond to pending job
} }
// *************************************************************************
//  TAG(                     slurm_sched_get_errno                        )
//
//  Report the current value of the plugin-global error number.
// *************************************************************************
extern "C" int slurm_sched_get_errno( void )
{
	return plugin_errno;
}
// ************************************************************************
//  TAG(                     slurm_sched_strerror                        )
//
//  This plugin supplies no error descriptions: the result is always NULL,
//  whatever errnum is.
// ************************************************************************
extern "C" char *slurm_sched_strerror( int errnum )
{
	(void) errnum;
	return NULL;
}
...@@ -57,6 +57,30 @@ typedef struct qsw_libstate *qsw_libstate_t; ...@@ -57,6 +57,30 @@ typedef struct qsw_libstate *qsw_libstate_t;
#define QSW_MAX_TASKS ELAN_MAX_VPS #define QSW_MAX_TASKS ELAN_MAX_VPS
#define QSW_PACK_SIZE (4 * (2+4+1+8+ELAN_BITMAPSIZE)) #define QSW_PACK_SIZE (4 * (2+4+1+8+ELAN_BITMAPSIZE))
/* NOTE: error codes should be between ESLURM_SWITCH_MIN and
 * ESLURM_SWITCH_MAX as defined in slurm/slurm_errno.h.
 * Each value is spelled out so that a number seen in a log can be
 * matched to its name directly. */
enum {
	/* Quadrics Elan specific error codes */
	ENOSLURM              = 3000,
	EBADMAGIC_QSWLIBSTATE = 3001,
	EBADMAGIC_QSWJOBINFO  = 3002,
	EINVAL_PRGCREATE      = 3003,
	ECHILD_PRGDESTROY     = 3004,
	EEXIST_PRGDESTROY     = 3005,
	EELAN3INIT            = 3006,
	EELAN3CONTROL         = 3007,
	EELAN3CREATE          = 3008,
	ESRCH_PRGADDCAP       = 3009,
	EFAULT_PRGADDCAP      = 3010,
	EINVAL_SETCAP         = 3011,
	EFAULT_SETCAP         = 3012,
	EGETNODEID            = 3013,
	EGETNODEID_BYHOST     = 3014,
	EGETHOST_BYNODEID     = 3015,
	ESRCH_PRGSIGNAL       = 3016,
	EINVAL_PRGSIGNAL      = 3017
};
int qsw_alloc_libstate(qsw_libstate_t *lsp); int qsw_alloc_libstate(qsw_libstate_t *lsp);
void qsw_free_libstate(qsw_libstate_t ls); void qsw_free_libstate(qsw_libstate_t ls);
......
...@@ -69,6 +69,56 @@ static pthread_cond_t neterr_cond = PTHREAD_COND_INITIALIZER; ...@@ -69,6 +69,56 @@ static pthread_cond_t neterr_cond = PTHREAD_COND_INITIALIZER;
#endif /* HAVE_LIBELAN3 */ #endif /* HAVE_LIBELAN3 */
/* One row of the error string table: an error number and its description */
typedef struct {
	int   xe_number;	/* error number */
	char *xe_message;	/* text describing xe_number */
} slurm_errtab_t;

/* Error numbers this plugin can describe; searched by xe_number */
static slurm_errtab_t slurm_errtab[] = {
	{ 0,  "No error" },
	{ -1, "Unspecified error" },

	/* Quadrics Elan routine error codes */
	{ ENOSLURM,              "Out of slurm" },	/* oh no! */
	{ EBADMAGIC_QSWLIBSTATE, "Bad magic in QSW libstate" },
	{ EBADMAGIC_QSWJOBINFO,  "Bad magic in QSW jobinfo" },
	{ EINVAL_PRGCREATE,
	  "Program identifier in use or CPU count invalid, try again" },
	{ ECHILD_PRGDESTROY,
	  "Processes belonging to this program are still running" },
	{ EEXIST_PRGDESTROY,     "Program identifier does not exist" },
	{ EELAN3INIT,
	  "Too many processes using Elan or mapping failure" },
	{ EELAN3CONTROL,         "Could not open elan3 control device" },
	{ EELAN3CREATE,          "Could not create elan capability" },
	{ ESRCH_PRGADDCAP,       "Program does not exist (addcap)" },
	{ EFAULT_PRGADDCAP,      "Capability has invalid address (addcap)" },
	{ EINVAL_SETCAP,         "Invalid context number (setcap)" },
	{ EFAULT_SETCAP,         "Capability has invalid address (setcap)" },
	{ EGETNODEID,            "Cannot determine local elan address" },
	{ EGETNODEID_BYHOST,     "Cannot translate hostname to elan address" },
	{ EGETHOST_BYNODEID,     "Cannot translate elan address to hostname" },
	{ ESRCH_PRGSIGNAL,       "No such program identifier" },
	{ EINVAL_PRGSIGNAL,      "Invalid signal number" }
};
/* /*
* These variables are required by the generic plugin interface. If they * These variables are required by the generic plugin interface. If they
* are not found in the plugin, the plugin loader will ignore it. * are not found in the plugin, the plugin loader will ignore it.
...@@ -129,7 +179,7 @@ int switch_p_libstate_save (char *dir_name) ...@@ -129,7 +179,7 @@ int switch_p_libstate_save (char *dir_name)
char *file_name; char *file_name;
if (qsw_alloc_libstate(&old_state)) if (qsw_alloc_libstate(&old_state))
return errno; return SLURM_ERROR;
qsw_fini(old_state); qsw_fini(old_state);
buffer = init_buf(1024); buffer = init_buf(1024);
(void) qsw_pack_libstate(old_state, buffer); (void) qsw_pack_libstate(old_state, buffer);
...@@ -140,7 +190,7 @@ int switch_p_libstate_save (char *dir_name) ...@@ -140,7 +190,7 @@ int switch_p_libstate_save (char *dir_name)
if (state_fd == 0) { if (state_fd == 0) {
error ("Can't save state, error creating file %s %m", error ("Can't save state, error creating file %s %m",
file_name); file_name);
error_code = errno; error_code = SLURM_ERROR;
} else { } else {
char *buf = get_buf_data(buffer); char *buf = get_buf_data(buffer);
size_t len =get_buf_offset(buffer); size_t len =get_buf_offset(buffer);
...@@ -152,7 +202,7 @@ int switch_p_libstate_save (char *dir_name) ...@@ -152,7 +202,7 @@ int switch_p_libstate_save (char *dir_name)
break; break;
if (wrote < 0) { if (wrote < 0) {
error ("Can't save switch state: %m"); error ("Can't save switch state: %m");
error_code = errno; error_code = SLURM_ERROR;
break; break;
} }
buf += wrote; buf += wrote;
...@@ -216,7 +266,7 @@ int switch_p_libstate_restore (char *dir_name) ...@@ -216,7 +266,7 @@ int switch_p_libstate_restore (char *dir_name)
} else { } else {
buffer = create_buf (data, data_size); buffer = create_buf (data, data_size);
if (qsw_unpack_libstate(old_state, buffer) < 0) if (qsw_unpack_libstate(old_state, buffer) < 0)
error_code = errno; error_code = SLURM_ERROR;
} }
} }
...@@ -270,7 +320,8 @@ int switch_p_build_jobinfo ( switch_jobinfo_t switch_job, char *nodelist, ...@@ -270,7 +320,8 @@ int switch_p_build_jobinfo ( switch_jobinfo_t switch_job, char *nodelist,
else { else {
error("qsw_getnodeid_byhost(%s) failure", error("qsw_getnodeid_byhost(%s) failure",
this_node_name); this_node_name);
error_code = ESLURM_INTERCONNECT_FAILURE; slurm_seterrno(ESLURM_INTERCONNECT_FAILURE);
error_code = SLURM_ERROR;
} }
free(this_node_name); free(this_node_name);
} }
...@@ -332,7 +383,6 @@ char *switch_p_sprint_jobinfo(switch_jobinfo_t switch_jobinfo, char *buf, ...@@ -332,7 +383,6 @@ char *switch_p_sprint_jobinfo(switch_jobinfo_t switch_jobinfo, char *buf,
int switch_p_node_init ( void ) int switch_p_node_init ( void )
{ {
#if HAVE_LIBELAN3 #if HAVE_LIBELAN3
int err = 0;
pthread_attr_t attr; pthread_attr_t attr;
/* /*
...@@ -343,19 +393,18 @@ int switch_p_node_init ( void ) ...@@ -343,19 +393,18 @@ int switch_p_node_init ( void )
* Load neterr elanid/hostname values into kernel * Load neterr elanid/hostname values into kernel
*/ */
if (_set_elan_ids() < 0) if (_set_elan_ids() < 0)
return SLURM_FAILURE; return SLURM_ERROR;
if ((err = pthread_attr_init(&attr))) if (pthread_attr_init(&attr))
error("pthread_attr_init: %s", slurm_strerror(err)); error("pthread_attr_init: %m");
err = pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_DETACHED); if (pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_DETACHED))
if (err) error("pthread_attr_setdetachstate: %m");
error("pthread_attr_setdetachstate: %s", slurm_strerror(err));
slurm_mutex_lock(&neterr_mutex); slurm_mutex_lock(&neterr_mutex);
if ((err = pthread_create(&neterr_tid, &attr, _neterr_thr, NULL))) if (pthread_create(&neterr_tid, &attr, _neterr_thr, NULL))
return SLURM_FAILURE; return SLURM_ERROR;
/* /*
* Wait for successful startup of neterr thread before * Wait for successful startup of neterr thread before
...@@ -430,12 +479,11 @@ static void *_neterr_thr(void *arg) ...@@ -430,12 +479,11 @@ static void *_neterr_thr(void *arg)
int switch_p_node_fini ( void ) int switch_p_node_fini ( void )
{ {
#if HAVE_LIBELAN3 #if HAVE_LIBELAN3
int err = pthread_cancel(neterr_tid); if (pthread_cancel(neterr_tid) == 0)
if (err == 0)
return SLURM_SUCCESS; return SLURM_SUCCESS;
error("Unable to cancel neterr thread: %s", slurm_strerror(err)); error("Unable to cancel neterr thread: %m");
return SLURM_FAILURE; return SLURM_ERROR;
#else /* !HAVE_LIBELAN3 */ #else /* !HAVE_LIBELAN3 */
return SLURM_SUCCESS; return SLURM_SUCCESS;
...@@ -562,7 +610,41 @@ _set_elan_ids(void) ...@@ -562,7 +610,41 @@ _set_elan_ids(void)
error("elan3_load_neterr_svc(%d, %s): %m", i, host); error("elan3_load_neterr_svc(%d, %s): %m", i, host);
} }
return 0; return SLURM_SUCCESS;
} }
#endif #endif
/*
* Linear search through table of errno values and strings,
* returns NULL on error, string on success.
*/
static char *_lookup_slurm_api_errtab(int errnum)
{
char *res = NULL;
int i;
for (i = 0; i < sizeof(slurm_errtab) / sizeof(slurm_errtab_t); i++) {
if (slurm_errtab[i].xe_number == errnum) {
res = slurm_errtab[i].xe_message;
break;
}
}
return res;
}
/*
 * switch_p_get_errno - report the switch specific error number, if any
 * RET the value of slurm_get_errno() when it lies within
 *	[ESLURM_SWITCH_MIN, ESLURM_SWITCH_MAX], otherwise SLURM_SUCCESS
 */
extern int switch_p_get_errno(void)
{
	int err = slurm_get_errno();

	/* anything outside the switch range is not ours to report */
	if ((err < ESLURM_SWITCH_MIN) || (err > ESLURM_SWITCH_MAX))
		return SLURM_SUCCESS;

	return err;
}
/*
 * switch_p_strerror - describe a switch specific error code
 * IN errnum - error number
 * RET the matching string from this plugin's error table if there is one,
 *	otherwise whatever strerror(errnum) returns
 */
extern char *switch_p_strerror(int errnum)
{
	char *msg = _lookup_slurm_api_errtab(errnum);

	if (!msg)
		msg = strerror(errnum);
	return msg;
}
...@@ -97,11 +97,6 @@ int switch_p_libstate_restore ( char * dir_name ) ...@@ -97,11 +97,6 @@ int switch_p_libstate_restore ( char * dir_name )
return SLURM_SUCCESS; return SLURM_SUCCESS;
} }
bool switch_p_no_frag ( void )
{
return false;
}
/* /*
* switch functions for job step specific credential * switch functions for job step specific credential
*/ */
...@@ -199,3 +194,21 @@ int switch_p_job_attach ( switch_jobinfo_t jobinfo, char ***env, ...@@ -199,3 +194,21 @@ int switch_p_job_attach ( switch_jobinfo_t jobinfo, char ***env,
return SLURM_SUCCESS; return SLURM_SUCCESS;
} }
/*
* switch functions for other purposes
*/
/*
 * switch_p_no_frag - this plugin always answers false
 * RET false
 */
bool
switch_p_no_frag ( void )
{
	return false;
}
/*
 * switch_p_get_errno - this plugin never reports a switch specific error
 * RET always SLURM_SUCCESS
 */
extern int
switch_p_get_errno(void)
{
	return SLURM_SUCCESS;
}
/*
 * switch_p_strerror - this plugin has no error descriptions of its own
 * IN errnum - error number (ignored)
 * RET always NULL
 */
extern char *
switch_p_strerror(int errnum)
{
	(void) errnum;
	return NULL;
}
...@@ -42,6 +42,8 @@ typedef struct slurm_sched_ops { ...@@ -42,6 +42,8 @@ typedef struct slurm_sched_ops {
int (*schedule) ( void ); int (*schedule) ( void );
u_int32_t (*initial_priority) ( u_int32_t ); u_int32_t (*initial_priority) ( u_int32_t );
void (*job_is_pending) ( void ); void (*job_is_pending) ( void );
int (*get_errno) ( void );
char * (*strerror) ( int );
} slurm_sched_ops_t; } slurm_sched_ops_t;
...@@ -72,7 +74,9 @@ slurm_sched_get_ops( slurm_sched_context_t *c ) ...@@ -72,7 +74,9 @@ slurm_sched_get_ops( slurm_sched_context_t *c )
static const char *syms[] = { static const char *syms[] = {
"slurm_sched_plugin_schedule", "slurm_sched_plugin_schedule",
"slurm_sched_plugin_initial_priority", "slurm_sched_plugin_initial_priority",
"slurm_sched_plugin_job_is_pending" "slurm_sched_plugin_job_is_pending",
"slurm_sched_get_errno",
"slurm_sched_strerror"
}; };
int n_syms = sizeof( syms ) / sizeof( char * ); int n_syms = sizeof( syms ) / sizeof( char * );
...@@ -235,3 +239,26 @@ slurm_sched_job_is_pending( void ) ...@@ -235,3 +239,26 @@ slurm_sched_job_is_pending( void )
(*(g_sched_context->ops.job_is_pending))(); (*(g_sched_context->ops.job_is_pending))();
} }
/* *********************************************************************** */
/*  TAG(                    slurm_sched_p_get_errno                     )  */
/*                                                                         */
/*  Forward to the scheduler plugin's get_errno operation.  Yields         */
/*  SLURM_ERROR instead when slurm_sched_init() reports a negative result. */
/* *********************************************************************** */
int
slurm_sched_p_get_errno( void )
{
	if ( slurm_sched_init() >= 0 )
		return (*(g_sched_context->ops.get_errno))( );

	return SLURM_ERROR;
}
/* *********************************************************************** */
/*  TAG(                    slurm_sched_p_strerror                      )  */
/*                                                                         */
/*  Forward errnum to the scheduler plugin's strerror operation.  Yields   */
/*  NULL instead when slurm_sched_init() reports a negative result.        */
/* *********************************************************************** */
char *
slurm_sched_p_strerror( int errnum )
{
	if ( slurm_sched_init() >= 0 )
		return (*(g_sched_context->ops.strerror))( errnum );

	return NULL;
}
...@@ -31,6 +31,16 @@ u_int32_t slurm_sched_initial_priority( u_int32_t max_prio ); ...@@ -31,6 +31,16 @@ u_int32_t slurm_sched_initial_priority( u_int32_t max_prio );
*/ */
void slurm_sched_job_is_pending( void ); void slurm_sched_job_is_pending( void );
/*
* Return any plugin-specific error number
*/
int slurm_sched_p_get_errno( void );
/*
* Return any plugin-specific error description
*/
char *slurm_sched_p_strerror( int errnum );
/* /*
************************************************************************** **************************************************************************
* U P C A L L S * * U P C A L L S *
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment