/*****************************************************************************\
* slurmctld.h - definitions of functions and structures for slurmctld use
*****************************************************************************
* Copyright (C) 2002 The Regents of the University of California.
* Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
* Written by Morris Jette <jette@llnl.gov> et al.
* UCRL-CODE-2002-040.
*
* This file is part of SLURM, a resource management program.
* For details, see <http://www.llnl.gov/linux/slurm/>.
*
* SLURM is free software; you can redistribute it and/or modify it under
* the terms of the GNU General Public License as published by the Free
* Software Foundation; either version 2 of the License, or (at your option)
* any later version.
*
* SLURM is distributed in the hope that it will be useful, but WITHOUT ANY
* WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
* FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
* details.
*
* You should have received a copy of the GNU General Public License along
* with SLURM; if not, write to the Free Software Foundation, Inc.,
* 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA.
\*****************************************************************************/
#ifndef _HAVE_SLURM_H
#define _HAVE_SLURM_H
#if HAVE_CONFIG_H
# include "config.h"
# if HAVE_INTTYPES_H
# include <inttypes.h>
# else
# if HAVE_STDINT_H
# include <stdint.h>
# endif
# endif /* HAVE_INTTYPES_H */
#endif
/* #include <stdlib.h> */
#include <time.h>
#include <sys/types.h>
#include <unistd.h>
#ifdef WITH_PTHREADS
# include <pthread.h>
#endif /* WITH_PTHREADS */
#include <slurm/slurm.h>
#include "src/common/bitstring.h"
#include "src/common/list.h"
#include "src/common/log.h"
#include "src/common/macros.h"
#include "src/common/pack.h"
#include "src/common/slurm_cred.h"
#include "src/common/slurm_protocol_api.h"
#include "src/common/switch.h"
#include "src/common/xmalloc.h"
#define FREE_NULL_BITMAP(_X) \
do { \
if (_X) bit_free (_X); \
_X = NULL; \
} while (0)
#define IS_JOB_FINISHED(_X) \
((_X->job_state & (~JOB_COMPLETING)) > JOB_RUNNING)
#define IS_JOB_PENDING(_X) \
((_X->job_state & (~JOB_COMPLETING)) == JOB_PENDING)
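/* Usage sketch (illustrative, not part of the original header): "job_ptr"
 * is a hypothetical pointer to a struct job_record (defined below). The
 * IS_JOB_* macros mask off the JOB_COMPLETING flag before comparing, so a
 * job whose state is (JOB_COMPLETE | JOB_COMPLETING) still tests finished:
 *
 *	if (IS_JOB_FINISHED(job_ptr))
 *		FREE_NULL_BITMAP(job_ptr->node_bitmap);
 *
 * FREE_NULL_BITMAP releases the bitmap only if non-NULL and then resets
 * the pointer, which guards against a later double bit_free(). */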
/*****************************************************************************\
* GENERAL CONFIGURATION parameters and data structures
\*****************************************************************************/
/* Maximum parallel threads to service incoming RPCs */
#define MAX_SERVER_THREADS 60
/* Perform a full save of slurmctld's state every PERIODIC_CHECKPOINT seconds */
#define PERIODIC_CHECKPOINT 300
/* Retry an incomplete RPC agent request every RPC_RETRY_INTERVAL seconds */
#define RPC_RETRY_INTERVAL 60
/* Attempt to schedule jobs every PERIODIC_SCHEDULE seconds despite
 * any RPC activity. This will catch any state transitions that may
* have otherwise been missed */
#define PERIODIC_SCHEDULE 60
/* Check for jobs reaching their time limit every PERIODIC_TIMEOUT seconds */
#define PERIODIC_TIMEOUT 60
/* Pathname of group file record for checking update times */
#define GROUP_FILE "/etc/group"
/* Check for updates to GROUP_FILE every PERIODIC_GROUP_CHECK seconds;
 * update the group uid_t access list as needed */
#define PERIODIC_GROUP_CHECK 600
/* Seconds to wait for backup controller response to REQUEST_CONTROL RPC */
#define CONTROL_TIMEOUT 4
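/* Minimal sketch (an assumption, not the daemon's actual loop) of how a
 * background thread could consume the intervals above; _save_all_state()
 * and _scan_job_time_limits() are hypothetical helpers named here only
 * for illustration, and slurmctld_config is declared below:
 *
 *	time_t now, last_checkpoint = 0, last_timeout = 0;
 *	while (slurmctld_config.shutdown_time == 0) {
 *		sleep(1);
 *		now = time(NULL);
 *		if (difftime(now, last_checkpoint) >= PERIODIC_CHECKPOINT) {
 *			last_checkpoint = now;
 *			_save_all_state();
 *		}
 *		if (difftime(now, last_timeout) >= PERIODIC_TIMEOUT) {
 *			last_timeout = now;
 *			_scan_job_time_limits();
 *		}
 *	}
 */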
typedef struct slurmctld_config {
int daemonize;
bool resume_backup;
time_t shutdown_time;
int server_thread_count;
slurm_cred_ctx_t cred_ctx;
#ifdef WITH_PTHREADS
pthread_mutex_t thread_count_lock;
pthread_t thread_id_main;
pthread_t thread_id_save;
pthread_t thread_id_sig;
pthread_t thread_id_rpc;
#else
int thread_count_lock;
int thread_id_main;
int thread_id_save;
int thread_id_sig;
int thread_id_rpc;
#endif
} slurmctld_config_t;
extern slurmctld_config_t slurmctld_config;
extern slurm_ctl_conf_t slurmctld_conf;
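/* Sketch of the intended thread accounting (an assumption based on the
 * fields above, not code from the daemon): each RPC service thread bumps
 * server_thread_count under thread_count_lock on entry and drops it on
 * exit, so the listener can honor MAX_SERVER_THREADS:
 *
 *	slurm_mutex_lock(&slurmctld_config.thread_count_lock);
 *	slurmctld_config.server_thread_count++;
 *	slurm_mutex_unlock(&slurmctld_config.thread_count_lock);
 *	... service the RPC ...
 *	slurm_mutex_lock(&slurmctld_config.thread_count_lock);
 *	slurmctld_config.server_thread_count--;
 *	slurm_mutex_unlock(&slurmctld_config.thread_count_lock);
 *
 * slurm_mutex_lock/unlock are the wrappers from src/common/macros.h. */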
/*****************************************************************************\
* NODE parameters and data structures
\*****************************************************************************/
#define MAX_NAME_LEN 32
#define CONFIG_MAGIC 0xc065eded
#define NODE_MAGIC 0x0de575ed
struct config_record {
uint32_t magic; /* magic cookie to test data integrity */
uint32_t cpus; /* count of cpus running on the node */
uint32_t real_memory; /* MB real memory on the node */
uint32_t tmp_disk; /* MB total storage in TMP_FS file system */
uint32_t weight; /* arbitrary priority of node for
* scheduling work on */
char *feature; /* arbitrary list of features associated with the node */
char *nodes; /* name of nodes with this configuration */
bitstr_t *node_bitmap; /* bitmap of nodes with this configuration */
};
extern List config_list; /* list of config_record entries */
struct node_record {
uint32_t magic; /* magic cookie for data integrity */
char name[MAX_NAME_LEN]; /* name of the node. NULL==defunct */
uint16_t node_state; /* enum node_states, ORed with
* NODE_STATE_NO_RESPOND if not
* responding */
time_t last_response; /* last response from the node */
uint32_t cpus; /* count of cpus on the node */
uint32_t real_memory; /* MB real memory on the node */
uint32_t tmp_disk; /* MB total disk in TMP_FS */
struct config_record *config_ptr; /* configuration spec ptr */
struct part_record *partition_ptr; /* partition for this node */
char comm_name[MAX_NAME_LEN]; /* communications path name to node */
slurm_addr slurm_addr; /* network address */
uint16_t comp_job_cnt; /* count of jobs completing on node */
uint16_t run_job_cnt; /* count of jobs running on node */
uint16_t no_share_job_cnt; /* count of jobs running that will
* not share nodes */
char *reason; /* why a node is DOWN or DRAINING */
struct node_record *node_next; /* next entry with same hash index */
};
extern struct node_record *node_record_table_ptr; /* ptr to node records */
extern time_t last_bitmap_update; /* time of last node creation or
 * deletion */
extern time_t last_node_update; /* time of last node record update */
extern int node_record_count; /* count in node_record_table_ptr */
extern bitstr_t *avail_node_bitmap; /* bitmap of available nodes,
* not DOWN, DRAINED or DRAINING */
extern bitstr_t *idle_node_bitmap; /* bitmap of idle nodes */
extern bitstr_t *share_node_bitmap; /* bitmap of sharable nodes */
extern struct config_record default_config_record;
extern struct node_record default_node_record;
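/* Illustrative walk over the node table (a sketch, not part of this
 * header): entries are indexed 0..node_record_count-1, a leading '\0'
 * in name marks a defunct record, and config_ptr reaches the shared
 * config_record for per-configuration values:
 *
 *	struct node_record *node_ptr;
 *	int i;
 *	for (i = 0; i < node_record_count; i++) {
 *		node_ptr = &node_record_table_ptr[i];
 *		if (node_ptr->name[0] == '\0')
 *			continue;
 *		debug("node %s: %u cpus", node_ptr->name,
 *		      node_ptr->config_ptr->cpus);
 *	}
 *
 * debug() is the logging call from src/common/log.h. */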
/*****************************************************************************\
* PARTITION parameters and data structures
\*****************************************************************************/
#define PART_MAGIC 0xaefe8495
struct part_record {
uint32_t magic; /* magic cookie to test data integrity */
char name[MAX_NAME_LEN];/* name of the partition */
uint16_t hidden; /* 1 if hidden by default */
uint32_t max_time; /* minutes or INFINITE */
uint32_t max_nodes; /* per job or INFINITE */
uint32_t min_nodes; /* per job */
uint32_t total_nodes; /* total number of nodes in the partition */
uint32_t total_cpus; /* total number of cpus in the partition */
uint16_t root_only; /* 1 if allocate/submit RPC can only be
issued by user root */
uint16_t shared; /* 1 if job can share a node,
2 if sharing required */
uint16_t state_up; /* 1 if state is up, 0 if down */
char *nodes; /* comma delimited list of node names */
char *allow_groups; /* comma delimited list of groups,
* NULL indicates all */
uid_t *allow_uids; /* zero terminated list of allowed users */
bitstr_t *node_bitmap; /* bitmap of nodes in partition */
};
extern List part_list; /* list of part_record entries */
extern time_t last_part_update; /* time of last part_list update */
extern struct part_record default_part; /* default configuration values */
extern char default_part_name[MAX_NAME_LEN]; /* name of default partition */
extern struct part_record *default_part_loc; /* default partition ptr */
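/* Hypothetical helper (a sketch only; no such function is declared in
 * this header) showing how the zero-terminated allow_uids list pairs
 * with allow_groups, where a NULL allow_groups opens the partition to
 * all users:
 *
 *	static int _uid_allowed(struct part_record *part_ptr, uid_t uid)
 *	{
 *		int i;
 *		if (part_ptr->allow_groups == NULL)
 *			return 1;
 *		for (i = 0; part_ptr->allow_uids[i]; i++) {
 *			if (part_ptr->allow_uids[i] == uid)
 *				return 1;
 *		}
 *		return 0;
 *	}
 */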
/*****************************************************************************\
* JOB parameters and data structures
\*****************************************************************************/
extern time_t last_job_update; /* time of last update to job records */
#define DETAILS_MAGIC 0xdea84e7
#define JOB_MAGIC 0xf0b7392c
#define STEP_MAGIC 0xce593bc1
#define KILL_ON_STEP_DONE 1
extern int job_count; /* number of jobs in the system */
/* job_details - specification of a job's constraints,
 * can be purged after initiation */
struct job_details {
uint32_t magic; /* magic cookie for data integrity */
uint32_t min_nodes; /* minimum number of nodes */
uint32_t max_nodes; /* maximum number of nodes */
char *req_nodes; /* required nodes */
char *exc_nodes; /* excluded nodes */
bitstr_t *req_node_bitmap; /* bitmap of required nodes */
bitstr_t *exc_node_bitmap; /* bitmap of excluded nodes */
char *features; /* required features */
uint16_t req_tasks; /* required number of tasks */
uint16_t shared; /* set if node can be shared */
uint16_t contiguous; /* set if requires contiguous nodes */
uint16_t wait_reason; /* reason job still pending, see
* slurm.h:enum job_wait_reason */
uint32_t min_procs; /* minimum processors per node */
uint32_t min_memory; /* minimum memory per node, MB */
uint32_t min_tmp_disk; /* minimum temporary disk space per node, MB */
char *err; /* pathname of job's stderr file */
char *in; /* pathname of job's stdin file */
char *out; /* pathname of job's stdout file */
uint32_t total_procs; /* number of allocated processors,
for accounting */
time_t submit_time; /* time of submission */
char *work_dir; /* pathname of working directory */
char **argv; /* arguments for a batch job script */
uint16_t argc; /* count of argv elements */
};
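/* Sketch of how the detail bitmaps combine with the global node bitmaps
 * (assumed usage, not the scheduler's actual test); "detail_ptr" is a
 * hypothetical pointer to the struct job_details above. bit_super_set()
 * from src/common/bitstring.h returns 1 when every bit set in its first
 * argument is also set in its second:
 *
 *	if (detail_ptr->req_node_bitmap &&
 *	    !bit_super_set(detail_ptr->req_node_bitmap, avail_node_bitmap))
 *		return ESLURM_NODES_BUSY;
 *
 * ESLURM_NODES_BUSY is one of the SLURM error codes from
 * slurm/slurm_errno.h. */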
struct job_record {
uint32_t job_id; /* job ID */
uint32_t magic; /* magic cookie for data integrity */
char name[MAX_NAME_LEN]; /* name of the job */
char partition[MAX_NAME_LEN]; /* name of the partition */
struct part_record *part_ptr; /* pointer to the partition record */
uint16_t batch_flag; /* 1 if batch job (with script) */
uint32_t user_id; /* user the job runs as */
uint32_t group_id; /* group submitted under */
enum job_states job_state; /* state of the job */
uint16_t kill_on_node_fail; /* 1 if job should be killed on
* node failure */
uint16_t kill_on_step_done; /* 1 if job should be killed when
* the job step completes, 2 if kill
* in progress */
select_jobinfo_t select_jobinfo; /* opaque data */
char *nodes; /* list of nodes allocated to job */
bitstr_t *node_bitmap; /* bitmap of nodes allocated to job */
uint32_t num_procs; /* count of required/allocated processors */
uint32_t time_limit; /* time_limit minutes or INFINITE,
* NO_VAL implies partition max_time */
time_t start_time; /* time execution begins,
* actual or expected */
time_t end_time; /* time of termination,
* actual or expected */
time_t time_last_active; /* time of last job activity */
uint32_t priority; /* relative priority of the job,
* zero == held (don't initiate) */
struct job_details *details; /* job details */
uint16_t num_cpu_groups; /* record count in cpus_per_node and
 * cpu_count_reps */