From 5a19e2fe784f958dea0cabf560b19c9751e1b9d9 Mon Sep 17 00:00:00 2001
From: Moe Jette <jette1@llnl.gov>
Date: Wed, 4 Jan 2006 16:22:44 +0000
Subject: [PATCH] Remove very old (and now misleading) design documents.

---
 doc/txt/API.for.DPCS.txt          | 213 ------------
 doc/txt/bgl.txt                   |  45 ---
 doc/txt/job.api.txt               |  90 -----
 doc/txt/job.initiation.design.txt | 188 ----------
 doc/txt/job.manager.design.txt    |  69 ----
 doc/txt/job_data.txt              | 107 ------
 doc/txt/job_manager_demo.txt      | 113 ------
 doc/txt/machine.status.demo.txt   |  45 ---
 doc/txt/machine.status.design.txt | 298 ----------------
 doc/txt/message.summary.txt       | 256 --------------
 doc/txt/misc.issues.txt           |  16 -
 doc/txt/partition.demo.txt        |  29 --
 doc/txt/partition.design.txt      | 207 -----------
 doc/txt/slurm.protocol.txt        | 559 ------------------------------
 14 files changed, 2235 deletions(-)
 delete mode 100644 doc/txt/API.for.DPCS.txt
 delete mode 100644 doc/txt/bgl.txt
 delete mode 100644 doc/txt/job.api.txt
 delete mode 100644 doc/txt/job.initiation.design.txt
 delete mode 100644 doc/txt/job.manager.design.txt
 delete mode 100644 doc/txt/job_data.txt
 delete mode 100644 doc/txt/job_manager_demo.txt
 delete mode 100644 doc/txt/machine.status.demo.txt
 delete mode 100644 doc/txt/machine.status.design.txt
 delete mode 100644 doc/txt/message.summary.txt
 delete mode 100644 doc/txt/misc.issues.txt
 delete mode 100644 doc/txt/partition.demo.txt
 delete mode 100644 doc/txt/partition.design.txt
 delete mode 100644 doc/txt/slurm.protocol.txt

diff --git a/doc/txt/API.for.DPCS.txt b/doc/txt/API.for.DPCS.txt
deleted file mode 100644
index fea897811c9..00000000000
--- a/doc/txt/API.for.DPCS.txt
+++ /dev/null
@@ -1,213 +0,0 @@
-Pre-Alpha SLURM API for DPCS
-Kevin Tew
-June 17, 2002
-
-Necessary include files:
-<slurm.h>
-
-All interaction with SLURM consists of structured messages.
-All messages are returned through a double pointer (**) and must be freed by
-calling the corresponding free function.
-
----------------------------
-API Layer init/destroy functions
----------------------------
-int slurm_set_api_config ( slurm_protocol_config_t * slurm_api_conf )
-	Sets the current api configuration
-
-slurm_protocol_config_t * slurm_get_api_config ( )
-	Returns the current api configuration
-
-int slurm_set_default_controllers ( char * primary_controller_hostname , char *
-secondary_controller_hostname, uint16_t pri_port , uint16_t sec_port )
-	This will set the default controllers' addresses and ports. This is
-the minimal amount of initialization that must occur before using the api
-functions.
-
----------------------------
-INFO Message functions
----------------------------
-All of the following are informational calls: if update_time is equal to the last time changes were made, nothing is returned. Otherwise all of the node, job, or partition records are returned.
-
-int slurm_load_jobs (time_t update_time, job_info_msg_t **job_info_msg_pptr)
-	Returns a job_info_msg_t that contains an array of job_table records
-
-int slurm_load_node (time_t update_time, node_info_msg_t **node_info_msg_pptr)
-	Returns a node_info_msg_t that contains an array of node_table records
-
-int slurm_load_partitions (time_t update_time, partition_info_msg_t **partition_info_msg_pptr)
-	Returns a partition_info_msg_t that contains an array of partition_table records
-
-
----------------------------
-JOB Allocation functions
----------------------------
-Once a job_desc_msg_t structure has been declared, it must be initialized by the following function before it is used.
-void slurm_init_job_desc_msg ( job_desc_msg_t * job_desc_msg ) ;
-
-int slurm_submit_batch_job ( job_desc_msg_t * job_desc_msg )
-Used to submit a batch/script job
-
-int slurm_will_job_run ( job_desc_msg_t * job_desc_msg , job_allocation_response_msg_t ** job_alloc_msg )
-Returns success if the resources are available immediately to launch the job
-
-int slurm_allocate_resources (job_desc_msg_t * job_desc_msg , job_allocation_response_msg_t ** job_alloc_msg , int immediately )
-Blocks until resources are allocated. If the immediate flag is set, the call
-either allocates resources immediately or returns a failure code if they are
-not available.
-
-
----------------------------
-JOB Cancel function
----------------------------
-int slurm_cancel_job ( uint32_t job_id )
-	Cancels the job identified by job_id.
-
-
-
----------------------------
-Error Code Functions
----------------------------
-SLURM functions return -1 on error and provide detailed error information
-through the following functions:
-
-	int slurm_get_errno ()
-	char * slurm_strerror ( int errno )
-
-
-
----------------------------
-Free Functions
----------------------------
-void slurm_free_job_allocation_response_msg ( job_allocation_response_msg_t * msg ) ;
-void slurm_free_job_desc_msg ( job_desc_msg_t * msg ) ;
-
-void slurm_free_job_info ( job_info_msg_t * msg ) ;
-void slurm_free_job_table ( job_table_t * job ) ;
-
-void slurm_free_partition_info ( partition_info_msg_t * msg ) ;
-void slurm_free_partition_table ( partition_table_t * part ) ;
-
-void slurm_free_node_info ( node_info_msg_t * msg ) ;
-void slurm_free_node_table ( node_table_t * node ) ;
-
-
-
----------------------------
-Message Structure definitions
----------------------------
-SEE src/common/slurm_protocol_defs.h for the latest structure.
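-
-A minimal usage sketch (illustrative only; it assumes nothing beyond the
-prototypes, structures, and free functions described in this document):
-
-	#include <stdio.h>
-	#include <slurm.h>
-
-	int main (void)
-	{
-		node_info_msg_t *node_msg = NULL;
-
-		/* an update_time of zero forces a full dump of node records */
-		if (slurm_load_node ((time_t) 0, &node_msg) == -1) {
-			printf ("slurm_load_node: %s\n",
-				slurm_strerror (slurm_get_errno ()));
-			return 1;
-		}
-		printf ("%u node records loaded\n", node_msg->record_count);
-
-		/* every returned message must be released with its free function */
-		slurm_free_node_info (node_msg);
-		return 0;
-	}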
-
-typedef struct slurm_job_allocation_response_msg
-{
-	uint32_t job_id;
-	char* node_list;
-} job_allocation_response_msg_t ;
-
-
-
-typedef struct job_desc_msg {	/* Job descriptor for submit, allocate, and update requests */
-	uint16_t contiguous;	/* 1 if job requires contiguous nodes, 0 otherwise,
-				 * default=0 */
-	char *features;		/* comma separated list of required features, default NONE */
-	char *groups;		/* comma separated list of groups the user can access,
-				 * default set to output of "/usr/bin/groups" by API,
-				 * can only be set if user is root */
-	uint32_t job_id;	/* job ID, default set by SLURM */
-	char *name;		/* name of the job, default "" */
-	void *partition_key;	/* root key to submit job, format TBD, default NONE */
-	uint32_t min_procs;	/* minimum processors required per node, default=0 */
-	uint32_t min_memory;	/* minimum real memory required per node, default=0 */
-	uint32_t min_tmp_disk;	/* minimum temporary disk required per node, default=0 */
-	char *partition;	/* name of requested partition, default in SLURM config */
-	uint32_t priority;	/* relative priority of the job, default set by SLURM,
-				 * can only be explicitly set if user is root, maximum
-				 * value is #fffffffe */
-	char *req_nodes;	/* comma separated list of required nodes, default NONE */
-	char *job_script;	/* pathname of required script, default NONE */
-	uint16_t shared;	/* 1 if job can share nodes with other jobs, 0 otherwise,
-				 * default in SLURM configuration */
-	uint32_t time_limit;	/* maximum run time in minutes, default is partition
-				 * limit as defined in SLURM configuration, maximum
-				 * value is #fffffffe */
-	uint32_t num_procs;	/* number of processors required by job, default=0 */
-	uint32_t num_nodes;	/* number of nodes required by job, default=0 */
-	uint32_t dist;
-	uint32_t procs_per_task;
-	uint32_t user_id;	/* set only if different from current UID, default set
-				 * to UID by API, can only be set if user is root */
-} job_desc_msg_t ;
-
-struct job_table {
-	uint32_t job_id;	/* job ID */
-	char *name;		/* name of the job */
-	uint32_t user_id;	/* user the job runs as */
-	uint16_t job_state;	/* state of the job, see enum job_states */
-	uint32_t time_limit;	/* maximum run time in minutes or INFINITE */
-	time_t start_time;	/* time execution begins, actual or expected */
-	time_t end_time;	/* time of termination, actual or expected */
-	uint32_t priority;	/* relative priority of the job */
-	char *nodes;		/* comma delimited list of nodes allocated to job */
-	int *node_inx;		/* list index pairs into node_table for *nodes:
-				   start_range_1, end_range_1, start_range_2, .., -1 */
-	char *partition;	/* name of assigned partition */
-	uint32_t num_procs;	/* number of processors required by job */
-	uint32_t num_nodes;	/* number of nodes required by job */
-	uint16_t shared;	/* 1 if job can share nodes with other jobs */
-	uint16_t contiguous;	/* 1 if job requires contiguous nodes */
-	uint32_t min_procs;	/* minimum processors required per node */
-	uint32_t min_memory;	/* minimum real memory required per node */
-	uint32_t min_tmp_disk;	/* minimum temporary disk required per node */
-	char *req_nodes;	/* comma separated list of required nodes */
-	int *req_node_inx;	/* list index pairs into node_table for *req_nodes:
-				   start_range_1, end_range_1, start_range_2, .., -1 */
-	char *features;		/* comma separated list of required features */
-	char *job_script;	/* pathname of required script */
-};
-typedef struct job_table job_table_t ;
-
-typedef struct job_info_msg {
-	uint32_t last_update;
-	uint32_t record_count;
-	job_table_t * job_array;
-} job_info_msg_t ;
-
-struct part_table {
-	char *name;		/* name of the partition */
-	uint32_t max_time;	/* minutes or INFINITE */
-	uint32_t max_nodes;	/* per job or INFINITE */
-	uint32_t total_nodes;	/* total number of nodes in the partition */
-	uint32_t total_cpus;	/* total number of cpus in the partition */
-	uint16_t default_part;	/* 1 if this is default partition */
-	uint16_t key;		/* 1 if slurm distributed key is required for use */
-	uint16_t shared;	/* 1 if job can share nodes, 2 if job must share nodes */
-	uint16_t state_up;	/* 1 if state is up, 0 if down */
-	char *nodes;		/* comma delimited list of names of nodes in partition */
-	int *node_inx;		/* list index pairs into node_table:
-				   start_range_1, end_range_1, start_range_2, .., -1 */
-	char *allow_groups;	/* comma delimited list of groups, null indicates all */
-} ;
-typedef struct part_table partition_table_t ;
-
-typedef struct partition_info_msg {
-	uint32_t last_update;
-	uint32_t record_count;
-	partition_table_t * partition_array;
-} partition_info_msg_t ;
-
-struct node_table {
-	char *name;		/* node name */
-	uint16_t node_state;	/* enum node_states, ORed with STATE_NO_RESPOND if down */
-	uint32_t cpus;		/* configured count of cpus running on the node */
-	uint32_t real_memory;	/* configured megabytes of real memory on the node */
-	uint32_t tmp_disk;	/* configured megabytes of total disk in TMP_FS */
-	uint32_t weight;	/* arbitrary priority of node for scheduling work on */
-	char *features;		/* arbitrary list of features associated with a node */
-	char *partition;	/* name of partition node configured to */
-};
-typedef struct node_table node_table_t ;
-typedef struct node_table node_table_msg_t ;
-
-typedef struct node_info_msg {
-	uint32_t last_update;
-	uint32_t record_count;
-	node_table_t * node_array;
-} node_info_msg_t ;
-
-
diff --git a/doc/txt/bgl.txt b/doc/txt/bgl.txt
deleted file mode 100644
index c388f354ffd..00000000000
--- a/doc/txt/bgl.txt
+++ /dev/null
@@ -1,45 +0,0 @@
-Notes for SLURM use with Blue Gene/L
-As of 19 August 2004
-
-There is still much development work required, but some basic functionality is
-in place and usable for development purposes.
-
-
-TO BUILD
-Download a current version of the SLURM code (version 0.4 or later).
-Execute "configure".
-Manually add "#define HAVE_BGL" to the file "config.h". This will
-	be fixed later with autoconf.
-Execute "make".
-Execute "make install".
-Build a configuration file:
-	Set "InactiveLimit=0" (or leave it at the default value of zero).
-	Define the base partitions using a three-digit suffix indicating their
-	positions in the X, Y, and Z dimensions. Set their NodeAddr to the
-	location where the slurmd daemon will execute (the same node for
-	all of the base partitions). Note that SLURM itself is not aware
-	of the actual node location, but this will work.
-
-
-TO RUN
-Execute "slurmctld" and *one* copy of "slurmd".
-The scontrol, scancel, squeue, and sinfo commands execute as desired.
-The srun command works fine too, but BGL will not support proper job
-	steps. There will be a job allocation and mpirun will use the
-	allocated BGL resources by virtue of environment variables. Attempts
-	to launch job steps will function, but all tasks get started on the
-	one node where the slurmd daemon executes.
-Most of the existing SLURM test suite runs, but some tests fail due to
-	the defined node names (e.g. "bgl123") not matching the real node
-	names reported by the hostname command.
-srun's attach option will only work for batch jobs, which will be the
-	normal BGL mode of operation.
-
-OTHER
-Execution of job steps has been disabled except for super-users (root or
-	SlurmUser). This is required to avoid overwhelming the one slurmd for
-	the entire system. The parallel tasks will be initiated exclusively
-	by mpirun. We could wrap mpirun to record the initiation of job steps,
-	but real support of that job step as a real SLURM entity (initiation,
-	signals, completion, etc.) would require a fair bit of work and provide
-	little real benefit.
-
diff --git a/doc/txt/job.api.txt b/doc/txt/job.api.txt
deleted file mode 100644
index 98d9cf9b793..00000000000
--- a/doc/txt/job.api.txt
+++ /dev/null
@@ -1,90 +0,0 @@
-The user defines a structure job_desc as below. He then calls slurm_job_desc_init(),
-which sets the values to initial values (#ffff, #ffffffff, or NULL for the pointers).
-The user then sets any values as desired, likely only a small subset of the values.
-Some of the values can be set only by user root (e.g. groups, priority, and user_id).
-For example, we ignore the request from user "tewk" to run as user "root" with
-group "root" and high priority. The user then calls the appropriate API (allocate,
-submit, update, will_run). The API call packs the values reset by the user, including
-proper values for user_id and group_id. Only the values set by the user are sent,
-packing a uint32_t value first with flags indicating which values are being sent
-(see "#define JOB_DESC_OPT_CONTIGUOUS" and below). The functions return an error
-code as defined below (see "#define SLURM_SUCCESS" and below). The slurm_allocate and
-slurm_submit functions will return a job_id upon success.
-
-#define NO_VAL_16 #ffff		/* value of uint16_t variable until user reset */
-#define NO_VAL_32 #ffffffff	/* value of uint32_t variable until user reset */
-
-structure job_desc {	/* Job descriptor for submit, allocate, and update requests */
-	uint16_t contiguous;	/* 1 if job requires contiguous nodes, 0 otherwise,
-				 * default=0 */
-	char *features;		/* comma separated list of required features, default NONE */
-	char *groups;		/* comma separated list of groups the user can access,
-				 * default set to output of "/usr/bin/groups" by API,
-				 * can only be set if user is root */
-	uint16_t immediate;	/* 1 if job should be initiated immediately or not at all,
-				 * default=0 or queue as needed */
-	uint32_t job_id;	/* job ID, default set by SLURM */
-	char *name;		/* name of the job, default "" */
-	void *partition_key;	/* root key to submit job, format TBD, default NONE */
-	uint32_t min_procs;	/* minimum processors required per node, default=0 */
-	uint32_t min_memory;	/* minimum real memory required per node, default=0 */
-	uint32_t min_tmp_disk;	/* minimum temporary disk required per node, default=0 */
-	char *partition;	/* name of requested partition, default in SLURM config */
-	uint32_t priority;	/* relative priority of the job, default set by SLURM,
-				 * can only be explicitly set if user is root, maximum
-				 * value is #fffffffe */
-	char *req_nodes;	/* comma separated list of required nodes, default NONE */
-	char *job_script;	/* pathname of required script, default NONE */
-	uint16_t shared;	/* 1 if job can share nodes with other jobs, 0 otherwise,
-				 * default in SLURM configuration */
-	uint32_t time_limit;	/* maximum run time in minutes, default is partition
-				 * limit as defined in SLURM configuration, maximum
-				 * value is #fffffffe */
-	uint32_t num_procs;	/* number of processors required by job, default=0 */
-	uint32_t num_nodes;	/* number of nodes required by job, default=0 */
-	uint32_t user_id;	/* set only if different from current UID, default set
-				 * to UID by API, can only be set if user is root */
-	uint16_t will_run_test;	/* 1 to test if job would begin immediately, but do not
-				 * actually initiate the job, default=0 or initiate job */
-};
-
-int slurm_job_desc_init (structure job_desc *job_info);
-int slurm_allocate (structure job_desc *job_info, uint32_t *job_id);
-int slurm_job_cancel (uint32_t job_id);	/* user_id passed by API */
-int slurm_job_update (structure job_desc *job_info);
-int slurm_submit (structure job_desc *job_info, uint32_t *job_id);
-int slurm_will_run (structure job_desc *job_info);
-
-/* The following flags indicate which of the job_desc structure entries were set by
- * the user and packed into the communications data structure. These flags are set
- * in a uint32_t variable in the communications data structure and followed by
- * those values in order. */
-#define JOB_DESC_OPT_CONTIGUOUS		1<<0
-#define JOB_DESC_OPT_FEATURES		1<<1
-#define JOB_DESC_OPT_GROUPS		1<<2
-#define JOB_DESC_OPT_IMMEDIATE		1<<3
-#define JOB_DESC_OPT_JOB_ID		1<<4
-#define JOB_DESC_OPT_JOB_NAME		1<<5
-#define JOB_DESC_OPT_PART_KEY		1<<6
-#define JOB_DESC_OPT_MIN_PROCS		1<<7
-#define JOB_DESC_OPT_MIN_REAL_MEM	1<<8
-#define JOB_DESC_OPT_MIN_TMP_DISK	1<<9
-#define JOB_DESC_OPT_PARTITION		1<<10
-#define JOB_DESC_OPT_PRIORITY		1<<11
-#define JOB_DESC_OPT_REQ_NODES		1<<12
-#define JOB_DESC_OPT_SCRIPT		1<<13
-#define JOB_DESC_OPT_SHARED		1<<14
-#define JOB_DESC_OPT_TIME_LIMIT		1<<15
-#define JOB_DESC_OPT_TOTAL_NODES	1<<16
-#define JOB_DESC_OPT_TOTAL_PROCS	1<<17
-#define JOB_DESC_OPT_USER_ID		1<<19
-#define JOB_DESC_OPT_WILL_RUN		1<<20
-/* Removed Distribution and ProcsPerTask, they go with a job step only */
-
-#define SLURM_SUCCESS	0	/* API call successfully completed */
-#define SLURM_NOCHANGE	1	/* Data requested unchanged since last get call */
-#define SLURM_EAGAIN	2	/* Try again later (e.g. no nodes for allocate) */
-#define SLURM_EACCESS	3	/* Permission denied */
-#define SLURM_EINVAL	4	/* Invalid arguments (e.g. can never satisfy request) */
-#define SLURM_ENOENT	5	/* No such entry (e.g. cancel with bad job_id) */
diff --git a/doc/txt/job.initiation.design.txt b/doc/txt/job.initiation.design.txt
deleted file mode 100644
index 478bfaadd43..00000000000
--- a/doc/txt/job.initiation.design.txt
+++ /dev/null
@@ -1,188 +0,0 @@
-SLURM Job Initiation DRAFT
-By Mark Grondona
-
-Abstract
-
-Job initiation refers to the process wherein SLURM accepts a request from
-a user for a job to run, along with corresponding constraints, and runs
-this job on N nodes with M tasks. In this document, Job Initiation will
-be expanded to include job termination as well. A job will be considered
-terminated when all remote tasks have exited and the nodes have been
-made available for another job.
-
-Design constraints include, but are not limited to, the following:
-
- o of course: specification of a wide range of options
-   (e.g. nodes,tasks,tasks/node,min # of cpus, runtime, etc., etc.)
- o preservation of current environment
- o configurable stdin forwarding (to one, all, or a specified task)
- o reliable signal forwarding
- o configurable stdout and stderr handling with minimal use of sockets
- o graceful and reliable termination of jobs under all conditions
- o support for prolog and epilog
- o initialize MPI/switch environments (see also
-   elan.runtime.requirements.txt)
-
-It is anticipated that the communications library will offer a framework
-for easily handling many of these constraints.
-
-Contents
-
-1.0 Making a Job Run Request
-  1.0.1 Node Allocation and Reservation
-  1.0.2 Job Startup
-
-2.0 Job Request Utility
-
-3.0 Notes
-
-4.0 References
-----
-
-1.0 Making a Job Run Request
-
-In this document, we will consider a job run to consist of two
-main parts:
-
- 1. Allocation and reservation of nodes
-
-    In this part of job initiation, nodes are assigned to the job
-    and those nodes are reserved, i.e. cannot be utilized by other
-    jobs in the system.
-
-    This step necessarily includes authentication and authorization
-    of the requester.
-
- 2. Initiation of one or more copies of the executable on all or some of
-    the reserved nodes.
-
-    In this second part of job initiation, all or some of the nodes
-    allocated to the job in step 1 run the user executable in parallel.
-
-    All tasks should pass stdout/err back to the initiating process,
-    and the initiating process should be able to forward stdin and
-    signals to all remote processes in a reliable manner. As mentioned
-    above, the communications layer is expected to be able to
-    handle this functionality.
-
-A third part could be considered job termination. This will be addressed
-later.
-
-1.0.1 Node Allocation and Reservation
-
-Node allocation and reservation will presumably be handled by the Job
-Manager. The requester will utilize the comm layer to send a job run
-request to the Job Manager (JM). The JM will reply with an authorization*
-and job #, or an error and error cause if the job cannot be run. The job
-run request may also specify whether the run request is to be immediate,
-or whether the request is merely for an allocation.
-
-If the request to run is for an allocation, the JM may simply assign
-the resources to the job# and pass necessary information back to the
-requester. The requester will spawn a shell on the local node which
-will have necessary environment variables defined (and necessary
-authorization) for subsequent job runs (without a node allocation step)
-from within the shell (like RMS's allocate command).
-
-1.0.2 Job startup
-
-If the request to run is immediate, then there are two possible designs:
-
-  A. JM spawns a thread/process which requests initiation of M
-     instantiations of the executable on all N assigned nodes. Somehow
-     the JM will pass back stdout/stdin/stderr file descriptors
-     to the requester.
-
-  B. The job requester sends an initiation request to start all tasks
-     on assigned nodes. Somehow, authorization for said nodes is
-     handled (in comm layer?) Comm layer also handles std{in,out,err},
-     signals, etc.
-
-The comm layer will make many of the other requirements of this step
-easy. There will presumably be some sort of job_run_request structure
-that may be passed through the comm layer to all remote nodes that
-have been allocated. This job_run_request structure should contain
-at least:
-
-  o job id
-  o uid, gid
-  o environment
-  o cwd
-  o argc, argv
-
-and possibly (?)
-
-  o task no. (if a separate message is generated for each task)
-  o some sort of requester info (?)
-    (can the above be handled based upon job id#?)
-
-The slurmds on the remote nodes will (somehow) be aware that a job has been
-authorized to run, and when they receive the job_run_request, will spawn
-what is currently called the Job Shepherd (JS). There will most likely be
-one JS per task. The JS will handle stdout/err and stdin/signal delivery
-to/from the remote requester. It will also monitor the user process and
-notify the requester (and/or some other layer?) when the task exits.
-
------------
-[*] depending on comm layer design, there may be a need for
-    some type of authorization key that the job requester will need
-    in order to proceed to job initiation after the resources
-    have been allocated. The term authorization will be used as
-    a placeholder until more specific information is known.
-
-2.0 Job Request Utility
-
-The job request utility will provide a tool with which users will be able
-to generate a job run request and pass this off to the local slurmd and
-job manager. The tool will be very similar to prun and/or poe and should
-provide at least the following functionality:
-
-  o required parameters:
-    - number of tasks
-      (in which case tasks will be assigned to nodes as
-       #tasks/node = NumCpus/node)
-      or
-    - number of nodes or cpus and
-    - number of tasks
-      (in which case cpus may be overcommitted, i.e. more than 1 task/cpu)
-
-  o optional parameters:
-    - partition to run in
-    - task distribution method
-    - stdin redirection
-    - stdout/err redirection
-    - resource constraints (See job.manager.design.txt)
-    - time limit
-    - label task IO
-    - retry submission
-    - submission retry period
-    - verbosity
-
-    - host lists (?)
-    - core file type (stack trace or full core) (?)
-    - allocate only? (or have separate command for this functionality)
-
-  o options may also be provided via environment vars
-  o environment propagation (including cwd)
-  o signal forwarding
-  o stdin forwarding
-  o stdout/err handling
-  o job heartbeat (do we need this at user level?)
-
-The above is just an initial list and further items should be appended.
-
-3.0 Notes
-
-  - Really need to define who actually sends the job_run_request to allocated
-    nodes. Should this be done from within slurmd or by the job run utility?
-    Perhaps comm layer design will make this point moot.
-
-  - Reliable job termination also needs a design. Perhaps that design should
-    go into this doc as I had originally thought? Now I am not so sure.
-    We have to make sure that all resources are freed on job exit, but
-    also try to make job termination as quick as possible to minimize wasted
-    resources. Not sure if this is necessarily a tradeoff relationship.
-
-4.0 References
-
-
diff --git a/doc/txt/job.manager.design.txt b/doc/txt/job.manager.design.txt
deleted file mode 100644
index 50c519dd09c..00000000000
--- a/doc/txt/job.manager.design.txt
+++ /dev/null
@@ -1,69 +0,0 @@
-SLURM Job Manager
-Moe Jette
-
-
-Job Submission
-
-Jobs will be submitted via an API with an execution specification
-which is expected to be constructed by a front-end program (e.g.
-prun or poe). The specification contains the following fields:
-
-User		The name of the user submitting the job (required)
-JobType		INTERACTIVE or BATCH (required)
-
-Job configuration requirements, all optional:
-Partition	Name of the partition to use.
-MaxTime		Maximum execution period of the job. This execution may be
-		provided in more than one piece via gang scheduling (job preemption).
-		The default value is unlimited, which is indicated with a value of -1.
-Distribute	Distribution pattern of tasks, FILL or ROUND. FILL will place a
-		full complement of tasks in sequential order on each node before
-		scheduling the next node. ROUND will distribute tasks in sequential
-		order one per node before scheduling additional tasks on any node.
-CpuCount	Total count of CPUs desired
-NodeCount	Total number of nodes desired
-NodeList	Explicit comma separated list of desired nodes. This is optional.
-		If specified, the CPUs parameter will be computed and all partition
-		configuration specifications must still be satisfied. These nodes need
-		not be in a single partition.
-TaskCount	Total count of tasks desired
-Contiguous	Only contiguous nodes can be allocated to the job, TRUE or FALSE.
-		Default is FALSE.
-
-Node configuration requirements, all optional:
-MinCpus		Minimum acceptable processor count per node
-MinSpeed	Minimum acceptable processor speed
-MinOS		Minimum acceptable operating system level
-MinTmpDisk	Minimum acceptable temporary disk space per node in MegaBytes
-MinRealMemory	Minimum acceptable real memory per node in MegaBytes
-MinVirtualMemory	Minimum acceptable virtual memory per node in MegaBytes
-Partition	The name of the partition on which the job should execute
-
-
-Node Selection Process:
-
-IF NodeList provided THEN
-	IF each node satisfies job configuration requirements
-	AND each node is available to the user
-	AND all partition limits satisfied
-	AND CpuCount limit satisfied
-	AND NodeCount limit satisfied THEN allocate the nodes
-
-ELSE IF NodeCount provided THEN
-	build list of nodes that satisfy job configuration requirements
-	FOR each node in list
-		IF node satisfies job configuration requirements
-		AND node is available to the user
-		AND all partition limits satisfied
-		THEN include the node in selection list until NodeCount and CpuCount satisfied
-	IF NodeCount and CpuCount satisfied THEN allocate the selection nodes
-
-ELSE IF CpuCount provided THEN
-	build list of nodes that satisfy job configuration requirements
-	FOR each node in list
-		IF node satisfies job configuration requirements
-		AND node is available to the user
-		AND all partition limits satisfied
-		THEN include the node in selection list until CpuCount satisfied
-	IF CpuCount satisfied THEN allocate the selection nodes
-
diff --git a/doc/txt/job_data.txt b/doc/txt/job_data.txt
deleted file mode 100644
index 8f50150ada6..00000000000
--- a/doc/txt/job_data.txt
+++ /dev/null
@@ -1,107 +0,0 @@
-Job information to be kept in three tables.
-Each table will have its own lock, to maximize parallelism.
-We will use the common/list library to manage the tables.
-
-
-JOB TABLE (basic job information)
----------
-Job ID (string)
-Job Name (string)
-Partition name (string)
-User ID (number)
-Nodes (string, comma separated list with regular expressions)
-State (Pending, Held, Running, Complete, Reserved_Resources, others?)
-Time_Limit (minutes)
-Start_Time (UTS, Anticipated time if State is Pending, NULL if Held, Actual time otherwise)
-End_Time (UTS, Expected if Running, Actual if Complete, NULL otherwise)
-Priority (Float)
-Key (for Elan connection)
-Num_Procs (Actual number of processors allocated to the job)
-other??
-
-NOTES:
-Sort list by Partition then Priority.
-
-Anticipated start time is difficult to calculate, at least given the need for contiguous nodes.
-Initially, I expect to just do FIFO. Later, the anticipated start time can be set based on node
-counts in the partition, considering when jobs complete. This will be sub-optimal, but with DPCS
-on top will work fine. The final version would be true backfill considering all the constraints
-(Phase 4+).
-
-Purge entries if End_Time is over some configurable age and the job is complete OR
-if we have too many entries. In the event that we have too many entries, there will
-probably be a multitude of bad jobs that fail right away for a particular user.
-We want to eliminate those entries first.
-
-
-JOB DETAILS (needed for job initiation)
------------
-Node_List (required nodes, if any, default NONE)
-Num_Procs (Minimum processor count, default NONE)
-Num_Nodes (Minimum node count, default NONE)
-Features (Required features, default NONE)
-Share (Flag if nodes can be shared, default NO)
-Contiguous (Flag if contiguous nodes required, default NO)
-Min_Procs (Minimum count of processors per node, default NONE)
-Min_Mem (Minimum size of real memory per node in MB, default NONE)
-Min_TmpDisk (Minimum size of temporary disk space per node in MB, default NONE)
-Dist (Distribution of tasks on the nodes, block or cyclic, default ??)
-Job_Script (Fully qualified pathname of script to be initiated)
-Procs_Per_Task (number of processors required for each task initiated, default 1)
-other?
-
-NOTES:
-An srun job MUST supply User ID, group names (for partition group filter) and
-either Node_List OR Num_Procs OR Num_Nodes. All other parameters are optional.
-
-An srun job may also supply an immediate flag (run or fail, not to be queued if set,
-default is NOT set).
-
-Can all of the other state information for a job (environment, stdout, etc.)
-be managed in the Job_Script? Can slurmctld just tell slurmd to start
-the specified Job_Script?
-
-Notes for Moe: Dist needs to be added to Parse_Job_Specs. Need to return
-Node_List by task, plus IP address list.
-
-
-ACCOUNTING TABLE
-----------------
-Job ID (zero if not a SLURM job)
-User ID (not redundant for non-SLURM jobs)
-CPU_Time_Allocated (computed from Num_Procs and Start_Time in JOB TABLE)
-CPU_Time_Used (actual use for job's processes, both system and user time)
-Real_Memory_Integral
-Virtual_Memory_Integral (DPCS has this, but we may be able to drop it)
-Real_Memory_Highwater (Largest real memory seen at any time on any node for the job)
-Virtual_Memory_Highwater (DPCS has this, but we may be able to drop it)
-
-NOTES:
-DPCS supports this virtual memory information. Based upon my experience, this
-information is virtually worthless. Essentially all of our applications have a
-real memory size (Resident Set Size) that is almost equal to the virtual memory size.
-
-CPU time allocated equals the CPUs allocated to the job times its time in State Running.
-I am breaking this out here to more easily support gang scheduling and/or checkpoint/restart
-in the future.
-
-The raw information will be collected by slurmd on each node. It will read the
-process tables, combine the data by session ID, compute changes since the previous
-snapshot, assign a Job ID to the records, and hold the data. We need to provide
-a Job ID to the session record(s) associated with a running SLURM job. We keep
-tallying the data until slurmctld requests it. At that time slurmd sends the
-totals back and clears the records. slurmd needs to make sure that resource
-use does not go backwards, and processes may terminate between snapshots.
-Since slurmd just sends up deltas, it need preserve no accounting data between
-restarts. Only the slurmctld will have actual totals.
-
-We want to collect resource utilization information on all processes,
-not just those associated with a SLURM job. This includes the operating
-system, system daemons, and idle time. We also want to report down time.
-This does not necessarily represent node down time, but slurmd down time.
-When the system is stable, these should be the same. We will need to
-establish pseudo UIDs for IDLE and DOWN time to distinguish them from
-root resource use.
-
-Users will be charged for reserved resources, even if no processes are running.
-
-This information will be provided in Phase 4 of SLURM.
diff --git a/doc/txt/job_manager_demo.txt b/doc/txt/job_manager_demo.txt
deleted file mode 100644
index 19f52de9ddd..00000000000
--- a/doc/txt/job_manager_demo.txt
+++ /dev/null
@@ -1,113 +0,0 @@
-# The sample node configuration file
-linux{jette}504: cat sample.node.conf2
-#
-# Sample sample.node.conf2
-# Author: John Doe
-# Date: 11/06/2001
-#
-Name=DEFAULT OS=Linux.2.4.7-1 CPUs=16 Speed=345.0 RealMemory=2048 VirtualMemory=4096 TmpDisk=16384 State=IDLE
-#
-# lx01-lx02 for login only, node state is DOWN for SLURM initiated jobs
-Name=lx01 State=DOWN
-Name=lx02 State=DOWN
-#
-# lx03-lx09 for partitions 1 (debug) and 3 (super)
-Name=DEFAULT Partition=1,3
-Name=lx03
-Name=lx04
-Name=lx05
-Name=lx06
-Name=lx07 TmpDisk=4096
-Name=lx08
-Name=lx09
-#
-# lx10-lx30 for partitions 0 (pbatch) and 3 (super)
-Name=DEFAULT Partition=0,3
-Name=lx10
-Name=lx11 VirtualMemory=2048
-Name=lx12 RealMemory=1024
-Name=lx13
-Name=lx14 CPUs=32
-Name=lx15
-Name=lx16
-Name=lx17
-Name=lx18 State=DOWN
-Name=lx19
-Name=lx20
-Name=lx21
-Name=lx22 CPUs=8
-Name=lx23
-Name=lx24
-Name=lx25
-Name=lx26
-Name=lx27
-Name=lx28
-Name=lx29
-Name=lx30
-#
-# lx31-lx32 for partitions 4 (class) and 3 (super)
-Name=DEFAULT Partition=3,4
-Name=lx31
-Name=lx32
-
-
-# The sample partition configuration file
-linux{jette}505: cat sample.part.conf2
-#
-# Example sample.part.conf2
-# Author: John Doe
-# Date: 12/14/2001
-#
-Name=pbatch Number=0 JobType=BATCH MaxCpus=128 MaxTime=UNLIMITED
-Name=debug Number=1 JobType=INTERACTIVE MaxCpus=16 MaxTime=60
-Name=super Number=3 JobType=ALL MaxCpus=UNLIMITED MaxTime=UNLIMITED AllowUsers=cdunlap,garlick,jette
-Name=class Number=4 JobType=ALL MaxCpus=16 MaxTime=10 AllowUsers=student1,student2,student3
-
-
-
-# Read a sample node and partition configuration
-# Select nodes for a collection of job initiation specifications
-linux{jette}506: Controller sample.node.conf2 sample.part.conf2 sample.job.conf2
-User=FAILS_Node_Down JobType=INTERACTIVE NodeCount=1 MaxTime=40 NodeList=lx01
-Will_Job_Run: node lx01 does not meet job NodeState specification
- Job can not be scheduled at this time, error=13
-
-User=jette JobType=INTERACTIVE NodeCount=2 MaxTime=40 NodeList=lx11,lx12
- Job scheduled on these nodes:
- lx11,lx12
-
-User=anyuser JobType=BATCH NodeCount=2 MaxTime=40 NodeList=lx11,lx12
- Job scheduled on these nodes:
- lx11,lx12
-
-User=FAILS_JobType JobType=Batch NodeCount=2 MinTmpDisk=1000 MaxTime=40 NodeList=lx07,lx12
-Find_Valid_Parts: Invalid JobType specified
- Job can not be scheduled at this time, error=22
-
-User=FAILS_Tmp_Disk JobType=BATCH NodeCount=2 MinTmpDisk=8000 MaxTime=40 NodeList=lx07,lx12
-Will_Job_Run: node lx07 does not meet job TmpDisk specification
- Job can not be scheduled at this time, error=13
-
-User=jette JobType=BATCH NodeCount=4 Partition=pbatch MinRealMemory=2048
- Job scheduled on these nodes:
- lx10,lx11,lx13,lx14
-
-User=userfor_lx14 JobType=BATCH NodeCount=1 Partition=pbatch MinCpus=32
- Job scheduled on these nodes:
- lx14
-
-User=userfor_lx10-12 JobType=BATCH NodeCount=2 Partition=pbatch MinVirtualMemory=4000
- Job scheduled on these nodes:
- lx10,lx12
-
-User=jette JobType=BATCH NodeCount=29
- Job scheduled on these nodes:
- lx03,lx04,lx05,lx06,lx07,lx08,lx09,lx10,lx11,lx12,lx13,lx14,lx15,lx16,lx17,lx19,lx20,lx21,lx22,lx23,lx24,lx25,lx26,lx27,lx28,lx29,lx30,lx31,lx32
-
-User=anyuser JobType=BATCH NodeCount=4 MinRealMemory=2048 Contiguous=FALSE
- Job scheduled on these nodes:
- lx10,lx11,lx13,lx14
-
-User=userforlx13-16 JobType=BATCH NodeCount=4 MinRealMemory=2048 Contiguous=TRUE
- Job scheduled on these nodes:
- lx13,lx14,lx15,lx16
diff --git a/doc/txt/machine.status.demo.txt b/doc/txt/machine.status.demo.txt
deleted file mode 100644
index f8d1c52df03..00000000000
--- a/doc/txt/machine.status.demo.txt
+++ /dev/null
@@ -1,45 +0,0 @@
-# Get system state info, speed in MHz, memory and disk in MB
-linux{jette}693: ./Get_Mach_Stat
-Name=linux OS=Linux2.4.7-10 CPUs=1 Speed=863.877014 RealMemory=248 VirtualMemory=517 TmpDisk=13511
-
-
-# A sample node configuration file
-linux{jette}695: cat sample.node.conf
-#
-# Sample sample.node.conf
-# Author: John Doe
-# Date: 11/06/2001
-#
-Name=DEFAULT OS=Linux2.4.7-1 CPUs=2 Speed=1.0 RealMemory=2048 VirtualMemory=4096 TmpDisk=16384 Partition=1
-#
-Name=lx01 Partition=
-Name=lx02
-Name=lx03 Speed=1.5 RealMemory=3072 Partition=1,2
-Name=lx04 CPUs=1 Speed=1.3 Partition=1,3
-Name=lx05
-Name=lx06
-#
-Name=DEFAULT OS=Linux3.0 CPUs=4 Speed=1.6 Partition=9
-Name=mx01
-Name=mx02 Pool=5 RealMemory=567
-
-
-# Read configuration and exercise a variety of operations (write, update, save state, etc.)
-linux{jette}696: ./Mach_Stat_Mgr sample.node.conf tmp.1 tmp.2
-Show_Node_Record: Name=mx03 OS=UNKNOWN CPUs=4 Speed=1.000000 RealMemory=0 VirtualMemory=0 TmpDisk=16384 Partition=0,9 State=IDLE LastResponse=123
-
-
-# Full dump of node configuration and state
-linux{jette}697: cat tmp.1
-#
-# Written by SLURM: Fri Dec 14 15:00:24 2001
-#
-Name=lx01 OS=Linux2.4.7-1 CPUs=2 Speed=1.000000 RealMemory=2048 VirtualMemory=4096 TmpDisk=16384 Partition=0 State=UNKNOWN, LastResponse=0
-Name=lx02 OS=Linux2.4.7-1 CPUs=2 Speed=1.000000 RealMemory=2048 VirtualMemory=4096 TmpDisk=16384 Partition=0,1 State=UNKNOWN, LastResponse=0
-Name=lx03 OS=Linux2.4.7-1 CPUs=2 Speed=1.500000 RealMemory=3072 VirtualMemory=4096 TmpDisk=16384 Partition=0,1,2 State=UNKNOWN, LastResponse=0
-Name=lx04 OS=Linux2.4.7-1 CPUs=1 Speed=1.300000 RealMemory=2048 VirtualMemory=4096 TmpDisk=16384 Partition=0,1,3 State=UNKNOWN, LastResponse=0
-Name=lx05 OS=Linux2.4.7-1 CPUs=2 Speed=1.000000 RealMemory=2048 VirtualMemory=4096 TmpDisk=16384 Partition=0,1 State=UNKNOWN, LastResponse=0
-Name=lx06 OS=Linux2.4.7-1 CPUs=2 Speed=1.000000 RealMemory=2048 VirtualMemory=4096 TmpDisk=16384 Partition=0,1 State=UNKNOWN, LastResponse=0
-Name=mx01 OS=Linux3.0 CPUs=3 Speed=1.600000 RealMemory=2048 VirtualMemory=4096 TmpDisk=12345 Partition=0,9 State=UNKNOWN, LastResponse=0
-Name=mx02 OS=Linux3.0 CPUs=4 Speed=1.600000 RealMemory=567 VirtualMemory=4096 TmpDisk=16384 Partition=0,9 State=UNKNOWN, LastResponse=0
-Name=mx03 OS=UNKNOWN CPUs=4 Speed=1.000000 RealMemory=0 VirtualMemory=0 TmpDisk=16384 Partition=0,9 State=IDLE, LastResponse=123
diff --git a/doc/txt/machine.status.design.txt b/doc/txt/machine.status.design.txt
deleted file mode 100644
index ecf30c05697..00000000000
--- a/doc/txt/machine.status.design.txt
+++ /dev/null
@@ -1,298 +0,0 @@
-SLURM Machine Status Infrastructure
-November 26, 2001
-By Moe Jette
-
-
-Abstract
-
-The purpose of SLURM's machine status infrastructure is to configure
-and monitor the state of nodes in the cluster.
-This document describes the phase one Machine Status implementation,
-which includes only basic node status information.
-More machine state information and job state information will be
-incorporated in phases two and three.
-The Machine Status Manager (MSM) will execute on the SLURM Control
-Machine and record the configuration of each node along with its
-latest reported state.
-This information will be available for viewing and/or modification
-using APIs and a Machine Status Tool (MST).
-The Machine Status Daemon (MSD) will execute on each SLURM node
-and report the state to the MSM.
-
-
-Machine Status Manager
-
-The Machine Status Manager (MSM) is responsible for maintaining a
-configuration record for each node in the cluster.
-MSM will have a configuration file identifying a variety of parameters.
-The location of this file will be provided by the symbolic link at
-"/etc/SLURM.cfg".
-There will be default values for most parameters if not specified.
-Lines in the configuration file having "#" in column one will be
-considered comments.
-Parameters used by the machine status include:
-ControlMachine	The name of the machine where control functions operate
-CollectorNodes	Comma separated list of nodes which can serve to
-	collect messages and combine the data so as to
-	reduce network traffic (Note: This design feature is unique
-	to SLURM and DPCS, it offers vastly improved scalability,
-	default is none, specify comma separated list of node
-	names as desired)
-NodeSpecConf	Fully qualified pathname of the file containing node
-	configuration information as described below (default
-	"/usr/local/SLURM/NodeSpecConf")
-PartitionConf	Fully qualified pathname of the file containing partition
-	configuration information as described below (default
-	"/usr/local/SLURM/PartitionConf")
-DefaultPartition	Name of the default partition
-MachStatusManager	Fully qualified pathname of the file containing the Machine
-	Status Manager (default "/usr/local/SLURM/MachStatusManager")
-MachStatusDaemon	Fully qualified pathname of the file containing the Machine
-	Status Daemon (default "/usr/local/SLURM/MachStatusDaemon")
-MachStatusPort	The port to be used for the Machine Status Manager and
-	Machine Status Daemon communications. Should be a privileged
-	port (acquired only by user root).
-MachStatusDebug	A list of debug flags separated by commas (default is
-	minimal logging, example "init,msg")
-HeartBeatInterval	Seconds between node status reports (default is "300")
-HeartBeatTimeout	If the last node status report is at least this number
-	of seconds ago, the node is considered "Down" (default is "600")
-Only one parameter should be specified per line with the parameter's name,
-an equal sign, and the value.
-White space is ignored.
-A sample SlurmConf file is included at the end of this document.
-
-
-MSM will maintain the following information about each node:
- 1 Name		Name of a node as returned by hostname (e.g. "lx12")
- 2 OS		Operating System name and level (output of the command
-		"/bin/uname -s -r | /bin/sed 's/ /./g'")
- 3 CPUs		Number of processors (e.g. "2")
- 4 Speed	Relative speed of these CPUs, units can be an arbitrary
-		floating point number, but MHz value is recommended
-		(e.g. "863.8")
- 5 RealMemory	Size of real memory in MegaBytes (e.g. "2048")
- 6 VirtualMemory	Size of virtual memory in MegaBytes (e.g. "4096")
- 7 TmpDisk	Size of temporary disk storage in MegaBytes (e.g. "16384")
- 8 Partition	List of partition numbers (collections of nodes) this node
-		belongs to, partition numbers range from 0 to 31 and are
-		specified with comma separators (e.g. "1,3").
- 9 LastResponse	Time of last contact from node, format is time_t as
-		returned by the "time" function
-10 State	State of node (e.g. IDLE, BUSY, DRAINING, DRAINED, DOWN)
-
-Only the first item, Name, must be supplied in the configuration file.
-Items two through eight can initially be read from the configuration file
-referred to by "NodeSpecConf" or can be established through communications
-with the Machine Status Daemon (MSD).
-Items nine and higher are established through communications with other
-SLURM components, primarily the Machine Status Daemon (MSD).
-The "Partition" specification will only come into play when SLURM starts to
-schedule resources and initiate jobs, in phase two of this project.
-If not otherwise specified, all nodes will be in partition zero.
-If a node is not to be included in any partition, indicate this with the
-expression "Partition= ".
-Change the value of MAX_PARTITION at SLURM build time to change the maximum
-partition value if so desired.
-Lines in the configuration file having "#" in column one will be
-considered comments.
-The configuration file should contain information about one node on
-a single line.
-Each field should contain the field's name, an equal sign, and the value.
-Fields should be space or tab separated.
-The default values for each node can be specified with a record in which
-"Name" is "DEFAULT".
-The default entry values will apply only to lines following it in the
-configuration file and the default values can be reset multiple times
-in the configuration file with multiple entries where "Name" is "DEFAULT".
-Any node with fewer resources or a lower release level than specified
-will have the event logged and will not be configured for executing
-user jobs.
-In order to support the concept of jobs requiring consecutive nodes,
-nodes should be placed in this file in consecutive order.
-The size of any field in the configuration file is limited to 1024 characters.
-A sample NodeSpecConf file is included at the end of this document.
-
-In order to simplify the building and support of SLURM, we will not
-incorporate a database into MSM at this time (if ever).
-The data can easily be kept in a C structure and written to a file
-for backup.
-This design also provides much greater speed than would be possible
-with a database.
-
-The operation of MSM will be as follows:
- 1. Read the SlurmConf file and confirm its integrity, log results
- 2. Read the NodeSpecConf file and confirm its integrity, log results
- 3. Contact the Machine Status Daemon for each of the CollectorNodes
-    to confirm each node is running. In the request, specify a
-    HeartBeatInterval of zero indicating the daemon should report status
-    once and await further instructions.
- 4. Divide the list of nodes which are *not* CollectorNodes into
-    lists of similar size with one list for each *responding*
-    CollectorNode. Save this mapping for fault-tolerance.
- 5. Send a message to the Machine Status Daemon on each of the
-    *responding* CollectorNodes with the configured HeartBeatInterval
-    and a node list. Each of these CollectorNodes will forward
-    the request to each of the listed nodes, collect all responses
-    along with its own, and reply in a single message to the MSM.
- 6. If a non-CollectorNode fails to respond within the HeartBeatTimeout,
-    mark this node as "Down".
- 7. If a CollectorNode fails to respond within the HeartBeatTimeout,
-    mark this node as "Down", redistribute its list of nodes
-    among other CollectorNodes and resume from step five above.
- 8. "Down" nodes should be tested for response periodically. The MSM
-    should test each for response on a "convenient" basis.
-9A. If a non-CollectorNode responds, add it to one of the CollectorNodes'
-    lists and resume from step five above.
-9B. If a CollectorNode responds, rebalance the CollectorNodes' lists
-    and resume from step five above. Note that a Machine Status Daemon
-    must first be told to stop responding to one CollectorNode before
-    starting to respond to another one.
-
-The MSM shall accept API requests on the MachStatusPort to set or get
-machine status information.
-Only user root will be permitted to set machine status information
-and authentication will be provided by virtue of the low port number.
-The most common set machine status operation is expected to be making
-a machine state "Up" after individual nodes are restarted.
-The API shall permit any user to get system information.
-It will be possible to get state information for individual machines
-or all machines.
-It will also be possible to get only specific fields on those nodes,
-for example "Get Name and State for all nodes".
-We do not expect to provide support for SQL queries; the filtering
-and other processing will be the responsibility of the application.
-
-We will provide a simple command-line interface to MSM utilizing
-the API described above.
-We anticipate providing this tool with support for:
-1. Identifying fields to be reported
-2. Identifying the machines to have state reported
-3. Sorting on specific fields
-4. Filtering on specific field values (e.g. "State=DOWN")
-
-
-Machine Status Daemon
-
-The Machine Status Daemon (MSD) at this point in time will only
-confirm that the daemon is active and the node functioning.
-MSD will accept one argument, the HeartBeatInterval.
-A zero HeartBeatInterval indicates the program should
-report status once and await further instructions.
-A non-zero HeartBeatInterval indicates the program should
-report status, sleep for the specified interval, then repeat.
-CollectorNodes will be given a list of other nodes to
-correspond with.
-The MSD must open connections with MSDs on the specified
-nodes and forward the requests.
-The MSD collects responses as they arrive and combines them
-into a single message buffer.
-When all nodes have responded or when its HeartBeatInterval
-is exceeded by 50 percent, the MSD sends the collected responses.
-
-
-Notes
-
-It is advisable to start the ControlMachine before any other
-of the cluster's nodes.
-
-There is no necessity for synchronized clocks on the nodes
-(unlike LoadLeveler).
-
-The hierarchical communications with CollectorNodes provide
-excellent scalability (unlike LoadLeveler).
-
-I am assuming that all nodes will have the same daemons
-running with the exception of the ControlMachine.
-The ControlMachine will direct each node to operate in a
-different fashion as needed.
-
-Fault-tolerance will be built through mechanisms to save
-and restore the database using local and global file systems.
-
-Do we want to special case nodes going down by running some
-script that might send e-mail?
-
-We need more complete documentation (e.g. man pages for all components).
-
-Do we want to specify fully qualified names for all machines or
-specify a domain name for conciseness?
-
-We need to discuss fault-tolerance, which requires the communications
-library design work.
-
-
-Sample SlurmConf
-
-#
-# Example SlurmConf
-# Author: John Doe
-# Date: 11/06/2001
-#
-ControlMachine = lx_control
-CollectorNodes = lx01,lx02
-NodeSpecConf = /usr/local/SLURM/NodeSpecConf
-PartitionConf = /usr/local/SLURM/PartitionConf
-#
-MachStatusManager = /usr/local/SLURM/MachStatusManager
-MachStatusDaemon = /usr/local/SLURM/MachStatusDaemon
-MachStatusPort = 612
-MachStatusDebug = init,msg
-#
-HeartBeatInterval = 300
-HeartBeatTimeout = 600
-
-
-Sample NodeSpecConf
-
-#
-# Example NodeSpecConf
-# Author: John Doe
-# Date: 11/06/2001
-#
-Name=DEFAULT OS=Linux.2.4.7-10 CPUs=2 Speed=1.0 RealMemory=2048 VirtualMemory=4096 TmpDisk=16384 Partition=1
-#
-Name=lx01
-Name=lx02
-Name=lx03 Speed=1.5 RealMemory=3072 Partition=1,2
-Name=lx04 CPUs=1 Speed=1.3 Partition=1,3
-Name=lx05
-Name=lx06
-
-
-Dependencies
-
-The Communications Library is required.
-
-
-Module Testing
-
-Test Machine Status Daemon with various time intervals including invalid values
-(see HeartBeatInterval).
-Test Machine Status Daemon with various file location names including invalid names
-(see MachStatusDaemon).
-Test Machine Status Daemon with various port numbers including invalid values
-(see MachStatusPort).
-Test Machine Status Daemon with debug options including invalid values
-(see MachStatusDebug).
-Review logs from above tests.
-
-
-
-Integration and System Testing
-
-Test Machine Status Manager fault-tolerance with nodes being dropped.
-Test Machine Status Manager with and without CollectorNodes (various counts).
-Test Machine Status Manager with various NodeSpecConf specifications
-(with and without various defaults).
-Test Machine Status Manager with various file location names including invalid names
-(see MachStatusManager).
-Test Machine Status Manager with debug options including invalid values
-(see MachStatusDebug).
-Test Machine Status Manager with various failure modes on both CollectorNodes
-and normal compute nodes.
-Test Machine Status Manager API with all options both from a privileged port
-and a non-privileged port (non-root user).
-Review logs from above tests.
-
-Test Machine Status Tool, all options in various combinations per man pages.
diff --git a/doc/txt/message.summary.txt b/doc/txt/message.summary.txt
deleted file mode 100644
index 272b999abbd..00000000000
--- a/doc/txt/message.summary.txt
+++ /dev/null
@@ -1,256 +0,0 @@
-SLURM message traffic summary
-
-Below is a very terse outline of possible message traffic that may occur
-across the various components of slurm. These messages will most likely
-form a REQ/REPLY pair, where - following Moe's convention - the Input is
-contained in the `request' message, and the Output will be found in the
-`reply.'
-
-Command(s): Get job/accounting/node/partition/job_step/build information,
-	separate API for each data type
-Client: squeue and scontrol commands, plus DPCS from API, any node in cluster
-Server: slurmctld
-Input: time-stamp, version, user id
-	flags : might be useful for filtering data sent, e.g. just this user's jobs
-Output: error code, version, time-stamp, record count, array of records
-Notes: most information is generally available, some might be restricted by user id
-
-
-Command(s): Get partition_key
-Client: API call (used by DPCS)
-Server: slurmctld
-Input: uid (must be root)
-Output: partition_key
-Notes: used to control access to some partitions. for example, any user
-	can run jobs in the "batch" partition, but only when initiated by
-	a batch controller (e.g. DPCS). this prevents users from running
-	jobs outside of the queue structure
-
-
-Command(s): Allocate job
-Client: srun or slurm api call
-Server: slurmctld
-Input: username/uid,nnodes,ntasks, group
-	optional: partition,time_limit,constraints,features,node list, partition_key
-	flags : wait_for_resources, test only (don't allocate resources,
-		just reply whether or not allocate would have succeeded,
-		used by DPCS)
-Output: job_id, return code, error code, node list, ncpus for *each* node in list,
-	job_key
-Notes: allocate resources to a ``job''
-
-
-Command(s): Claim job allocation
-Client: srun
-Server: slurmctld
-Input: uid, job_id, job_key
-Output: error_code
-Notes: ties allocation to a specific process_id, used to determine when a
-	job is really complete
-
-
-Command(s): Submit job
-Client: srun or slurm api call
-Server: slurmctld
-Input: Allocate input + script path, environment, cwd
-	optional: partition, time_limit, constraints, features,
-		I/O location, signal handling, partition_key
-	flags:
-Output: job_id, return code, error code
-Notes: submit a batch job to the slurm queue
-
-
-Command(s): Register Job Step
-Client: srun or slurm api call
-Server: slurmctld
-Input: job_id,username/uid
-	optional: nnodes,ntasks,cpus_per_task,distribution,time_limit,
-		constraints,features,signal handling
-	flags : wait_for_resources
-Output: job_id, step_id, return code, error code, node list, ncpus/node,
-	credential list
-Notes: run a set of parallel tasks under an allocated job
-	allocate resources if job_id < MIN_JOBID, otherwise assume
-	resources are already available
-
-
-Command(s): Get Job Step Information
-Client: srun, scancel
-Server: slurmctld
-Input: job_id, step_id, uid
-Output: return code, error code, node list, ncpus/node(?)
-Notes: Obtain the list of resources assigned to a currently executing
-	job step. Needed by at least `srun --attach`, and scancel.
-	uid must match that of job_id or be root
-
-
-Command(s): Run Job Step Request
-Client: srun or slurmctld
-Server: slurmd
-Input: username/uid, job_id, step_id, credential, ntasks, environment,
-	cwd, command line, stdin location, stdout/err location
-Output: return code, error code
-Notes: request initiation of ntasks tasks on this node.
-
-
-Command(s): Signal Job Step by Node Request
-Client: srun or slurmctld (or possibly scancel)
-Server: slurmd
-Input: uid, signal no., job_id
-	optional: step_id, task no.
-Output: return code
-Notes: Signal all steps and all tasks unless otherwise specified.
-	This could be used to support gang scheduling.
-	Cancel all steps unless otherwise specified.
-
-
-Command(s): Signal Job Step by Cluster Request
-Client: scancel user command, plus DPCS from API, any node in cluster
-Server: slurmctld
-Input: uid, signal no., job_id
-	optional: step_id, task no.
-Output: return code
-Notes: Can only be run as user root or the user id for the job.
-	Cancel all steps unless otherwise specified.
-
-
-Command(s): Job Step Attach Request
-Client: srun
-Server: slurmd
-Input: uid, job_id, step_id
-Output: return code, error code,
-        stdout/err duplicated to srun stdout/err, signals propagated
-Notes: srun process ``attaches'' to a currently running job.  This
-       request is used for srun recovery, or by a user who wants
-       to interactively reattach to a batch job.
-
-
-Command(s): Reconfigure (tell slurmctld to re-read configuration)
-Client: scontrol administrator command, any node in cluster
-Server: slurmctld
-Input: user id (root), version
-Output: error code, version
-Notes: can only be run as user root
-
-
-Command(s): Register node
-Client: slurmd daemon, any node in cluster
-Server: slurmctld
-Input: version, time stamp, processor count, memory size, temporary disk space
-Output: none
-Notes: Done when slurmd restarts
-
-
-Command(s): Report node status
-Client: slurmctld or backup slurmctld
-Server: slurmd or slurmctld (for backup check) daemon, any node in cluster
-Input: none
-Output: version, time stamp, processor count, memory size, temporary disk space
-Notes:
-
-
-Command(s): Upload accounting information
-Client: slurmctld
-Server: slurmd daemon, any node in cluster
-Input: none
-Output: version, time stamp, collection of records with user id, memory use,
-        CPU time used, CPU time allocated, etc.
-Notes: not needed for initial release
-
-
-Command(s): Get job id from process id
-Client: DPCS API
-Server: slurmd daemon on the same node as the DPCS API is executed
-Input: process id
-Output: SLURM job id
-Notes: Until SLURM accounting is fully functional, DPCS needs help figuring
-       out which processes are associated with each job.  All message traffic
-       stays within a node.
-
-
-Command(s): Get job step information
-Client: srun from user script, slurmd, any node in cluster
-Server: slurmctld
-Input: job id, uid, step id
-Output: step id, elan context (opaque data structure)
-Notes: needed to start a parallel program
-
-
-Command(s): Modify node information
-Client: scontrol
-Server: slurmctld
-Input: node name, node state, uid
-Output: exit code
-Notes: Only the node state can be changed (esp. to DRAINING, DOWN, IDLE).
-       Only requests from user root are accepted.
-
-Command(s): Modify partition information
-Client: scontrol
-Server: slurmctld
-Input: partition name, allowed groups, default, key, max time, max nodes,
-       node list, shared, state, uid
-Output: exit code
-Notes: Only requests from user root are accepted.
-
-
-Command(s): Modify job information
-Client: scontrol
-Server: slurmctld
-Input: job id, time limit, priority, uid (must be root or owner)
-Output: exit code
-Notes: Only requests from the job's owner or user root are accepted.
-
-
-Command(s): Run epilog
-Client: slurmctld
-Server: slurmd
-Input: job_id
-Output: error code
-Notes: On termination of a job (not the job step), slurmctld tells
-       slurmd to execute its epilog program (if any).
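-
-As an illustration of the "Run epilog" interaction above, here is a minimal
-sketch of what the slurmd side might do on receipt of the request.  The
-epilog path argument and the SLURM_JOB_ID environment variable are
-assumptions for illustration, not part of the message definition.
-
-    #include <stdint.h>
-    #include <stdio.h>
-    #include <stdlib.h>
-    #include <sys/types.h>
-    #include <sys/wait.h>
-    #include <unistd.h>
-
-    /* Run the configured epilog for a completed job; returns the epilog's
-     * exit status, or -1 on error. */
-    static int run_epilog(const char *epilog_path, uint32_t job_id)
-    {
-            int status;
-            pid_t pid = fork();
-
-            if (pid < 0)
-                    return -1;              /* fork failed */
-            if (pid == 0) {                 /* child: exec the epilog */
-                    char job_str[16];
-                    snprintf(job_str, sizeof(job_str), "%u",
-                             (unsigned int) job_id);
-                    setenv("SLURM_JOB_ID", job_str, 1);
-                    execl(epilog_path, epilog_path, (char *) NULL);
-                    _exit(127);             /* exec failed */
-            }
-            if (waitpid(pid, &status, 0) < 0)   /* parent: reap the child */
-                    return -1;
-            return WIFEXITED(status) ? WEXITSTATUS(status) : -1;
-    }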
-
-
-Summary of interactions:
-dpcs->slurmd            Get job id from process id
-
-scancel->slurmctld      Signal Job Step by Cluster Request
-
-scancel->slurmd         Signal Job Step by Node Request
-
-scontrol->slurmctld     Reconfigure
-                        Modify job information
-                        Modify node information
-                        Modify partition information
-                        Get job/accounting/node/partition/job_step/build information
-
-slurmctld->slurmctld    Report node status (backup to primary controller)
-
-slurmctld->slurmd       Signal Job Step by Node Request
-                        Report node status
-                        Run epilog
-                        Run Job Step Request
-                        Upload accounting information
-
-slurmd->slurmctld       Get job step information
-                        Register node
-
-srun->slurmctld         Get Job Step Information
-                        Job Step Attach Request
-                        Register Job Step
-                        Submit job
-                        Allocate job
-                        Claim job allocation
-
-srun->slurmd            Kill Job Step Request
-                        Signal Job Step by Node Request
-                        Run Job Step Request
-
-----TEMPLATE----
-Command(s):
-Client:
-Server:
-Input:
-Output:
-Notes:
diff --git a/doc/txt/misc.issues.txt b/doc/txt/misc.issues.txt
deleted file mode 100644
index fd816a4aa44..00000000000
--- a/doc/txt/misc.issues.txt
+++ /dev/null
@@ -1,16 +0,0 @@
-Requirements Discussion 12/25/2001, Chris, Jim, Py Watson:
-- Atomicity needed for DPCS willrun - schedule - run operations.
-- schedule debug windows (ratchet down limits) / include interactive
-- burst over limits when nodes available, but mark such jobs as killable
-- haiku error messages (see perl module)
-- speed
-- no core by default, core "lite", overrideable, core.hostname.pid,
-  core directory to avoid contention when creating the directory
-- preemption (SIGSTOP job in the way), EINTR issues, mark "non-preemptable"
-- checkpoint/restart
-- refer to purple RFP comments on batch systems
-
-Meeting 1/4/2002, Moe, Mark G., Jim, Chris:
-- Elan needs contiguous nodes for hardware broadcast; achieving this
-  is incompatible with DPCS's scheduling algorithm.  RMS uses box packing.
-- Need DPCS constraint for contiguous nodes, passed in with willrun query.
diff --git a/doc/txt/partition.demo.txt b/doc/txt/partition.demo.txt
deleted file mode 100644
index 71874aa1955..00000000000
--- a/doc/txt/partition.demo.txt
+++ /dev/null
@@ -1,29 +0,0 @@
-# Sample partition configuration file
-#
-# Example sample.part.conf
-# Author: John Doe
-# Date: 12/14/2001
-#
-Name=DEFAULT JobType=Batch
-#
-Name=DEFAULT JobType=INTERACTIVE MaxCpus=16
-Name=pbatch Number=0 JobType=BATCH MaxCpus=128 MaxTime=UNLIMITED
-Name=debug Number=1 MaxCpus=4 MaxTime=60
-Name=super Number=3 JobType=BATCH MaxCpus=256 AllowUsers=cdunlap,garlick,jette
-Name=class Number=4 JobType=ALL MaxCpus=UNLIMITED MaxTime=60 AllowUsers=student1,student2,student3
-
-
-# Executed assorted partition tests: read, write, update, select available partitions for pending job, etc.
-Partition_Mgr sample.part.conf tmp.1
-Show_Part_Record: Name=test Number=10 JobType=ALL MaxTime=UNLIMITED MaxCpus=UNLIMITED State=DOWN DenyUsers=non_tester
-
-
-# Sample partition configuration file written by SLURM after updates
-#
-# Written by SLURM: Tue Dec 18 12:48:22 2001
-#
-Name=pbatch Number=0 JobType=BATCH MaxTime=UNLIMITED MaxCpus=128 State=UP DenyUsers=student1
-Name=debug Number=1 JobType=INTERACTIVE MaxTime=60 MaxCpus=4 State=UP
-Name=super Number=3 JobType=BATCH MaxTime=UNLIMITED MaxCpus=256 State=UP AllowUsers=cdunlap,garlick,jette
-Name=class Number=4 JobType=ALL MaxTime=60 MaxCpus=UNLIMITED State=UP AllowUsers=student1,student2,student3
-Name=test Number=10 JobType=ALL MaxTime=UNLIMITED MaxCpus=UNLIMITED State=DOWN DenyUsers=non_tester
diff --git a/doc/txt/partition.design.txt b/doc/txt/partition.design.txt
deleted file mode 100644
index 3a18b0c44b0..00000000000
--- a/doc/txt/partition.design.txt
+++ /dev/null
@@ -1,207 +0,0 @@
-SLURM Partition Management Infrastructure
-December 14, 2001
-By Moe Jette
-
-
-Abstract
-
-The purpose of SLURM's partition management infrastructure is to configure
-and monitor the state of partitions in the cluster.
-The Partition Manager (PM) will execute on the SLURM Control
-Machine and record the configuration of each partition along with its
-latest reported state.
-The Partition Manager will actually be a component within the
-MachStatusManager program (along with the Machine Status Manager).
-This information will be available for viewing and/or modification
-using APIs and a Machine Status Tool (MST, which also manages node
-information).
-
-Partition Manager
-
-The Partition Manager (PM) is responsible for maintaining a
-configuration record for each partition in the cluster.
-PM will have a configuration file identifying a variety of parameters.
-The location of this file will be provided by the symbolic link at
-"/etc/SLURM.cfg".
-There will be default values for most parameters if not specified.
-Lines in the configuration file having "#" in column one will be
-considered comments.
-Parameters used by the machine status include:
-ControlMachine    The fully qualified name of the machine where control
-                  functions operate
-CollectorNodes    Comma separated list of nodes which can serve to
-                  collect messages and combine the data so as to
-                  reduce network traffic (Note: This design feature is
-                  unique to SLURM and DPCS; it offers vastly improved
-                  scalability.  The default is none; specify a comma
-                  separated list of fully qualified node names as desired)
-NodeSpecConf      Fully qualified pathname of the file containing node
-                  configuration information as described below (default
-                  "/usr/local/SLURM/NodeSpecConf")
-PartitionConf     Fully qualified pathname of the file containing partition
-                  configuration information as described below (default
-                  "/usr/local/SLURM/PartitionConf")
-DefaultPartition  Name of the default partition
-MachStatusManager The fully qualified pathname of the file containing the
-                  Machine Status Manager (default
-                  "/usr/local/SLURM/MachStatusManager")
-MachStatusDaemon  The fully qualified pathname of the file containing the
-                  Machine Status Daemon (default
-                  "/usr/local/SLURM/MachStatusDaemon")
-MachStatusPort    The port to be used for Machine Status Manager and
-                  Machine Status Daemon communications.  Should be a
-                  privileged port (acquired only by user root).
-MachStatusDebug   A list of debug flags separated by commas (default is
-                  minimal logging, example "init,msg")
-HeartBeatInterval Seconds between node status reports (default is "300")
-HeartBeatTimeout  If the last node status report is at least this number
-                  of seconds old, the node is considered "Down" (default
-                  is "600")
-Only one parameter should be specified per line with the parameter's name,
-an equal sign, and the value.
-White space is ignored.
-A sample SlurmConf file is included at the end of this document.
-
-
-PM will maintain the following information about each partition:
- 1 Name        Name by which the partition may be referenced (e.g. "Interactive")
- 2 Number      Unique number by which the partition can be referenced
- 3 JobType     Jobs which may execute in the partition, default is "ALL"
-               (e.g. BATCH, INTERACTIVE, ALL)
- 4 MaxTime     Maximum wall-time limit for any job in minutes,
-               default value is "UNLIMITED"
- 5 MaxCpus     Maximum count of CPUs which may be allocated to any job,
-               default value is "UNLIMITED"
- 6 State       State of the partition (e.g. UP or DOWN),
-               default value is "UP"
- 7 AllowUsers  Names of users who may use the partition,
-               separated by commas, default value is "ALL"
- 8 DenyUsers   Names of users who may not use the partition,
-               separated by commas, default value is "NONE"
-
-
-Only the first two items, Name and Number, must be supplied in the configuration file.
-If not otherwise specified, all nodes will be in partition zero.
-Lines in the configuration file having "#" in column one will be
-considered comments.
-The configuration file should contain the information about one partition on
-a single line.
-Each field should contain the field's name, an equal sign, and the value.
-Fields should be space or tab separated.
-If other default values are preferred, they can be specified with a record
-in which "Name" is "DEFAULT".
-The DEFAULT entry's values will apply only to lines following it in the
-configuration file, and the default values can be reset multiple times
-in the configuration file with multiple entries where "Name" is "DEFAULT".
-The size of any field in the configuration file is limited to 1024 characters.
-If user controls are desired, set either AllowUsers or DenyUsers, but not both.
-If AllowUsers is set, then DenyUsers is ignored, and vice versa.
-A sample PartitionConf file is included at the end of this document.
-
-In order to simplify the building and support of SLURM, we will not
-incorporate a database into PM at this time (if ever).
-The data can easily be kept in a C structure and written to a file
-for backup.
-This design also provides much greater speed than would be possible
-with a database.
-
-The operation of PM will be as follows:
- 1. Read the SlurmConf file and confirm its integrity, log results
- 2. Read the PartitionConf file and confirm its integrity, log results
-
-PM shall accept API requests on the MachStatusPort to set or get
-partition status information.
-Only user root will be permitted to set partition status information,
-and authentication will be provided by virtue of the low port number.
-The API shall permit any user to get system information.
-It will be possible to get state information for individual partitions
-or all partitions.
-We do not expect to provide support for SQL queries; the filtering
-and other processing will be the responsibility of the application.
-
-We will provide a simple command-line interface to PM utilizing
-the API described above.
-We anticipate providing this tool with support for:
-1. Identifying fields to be reported
-2. Identifying the partitions to have state reported
-3. Sorting on specific fields
-4. Filtering on specific field values (e.g. "State=DOWN")
-
-
-Notes
-
-It is advisable to start the ControlMachine before any other
-of the cluster's nodes.
-
-There is no necessity for synchronized clocks on the nodes
-(unlike LoadLeveler).
-
-The hierarchical communications with CollectorNodes provide
-excellent scalability (unlike LoadLeveler).
-
-I am assuming that all nodes will run the same daemons,
-with the exception of the ControlMachine.
-The ControlMachine will direct each node to operate in a
-different fashion as needed.
-
-Fault-tolerance will be provided through mechanisms to save
-and restore the database using local and global file systems.
-
-We need more complete documentation (e.g. man pages for all components).
-
-We need to discuss fault-tolerance, which requires the communications
-library design work.
-
-
-Sample SlurmConf
-
-#
-# Example SlurmConf
-# Author: John Doe
-# Date: 11/06/2001
-#
-ControlMachine = lx_control.llnl.gov
-CollectorNodes = lx01.llnl.gov,lx02.llnl.gov
-NodeSpecConf = /usr/local/SLURM/NodeSpecConf
-PartitionConf = /usr/local/SLURM/PartitionConf
-#
-MachStatusManager = /usr/local/SLURM/MachStatusManager
-MachStatusDaemon = /usr/local/SLURM/MachStatusDaemon
-MachStatusPort = 612
-MachStatusDebug = init,msg
-#
-HeartBeatInterval = 300
-HeartBeatTimeout = 600
-
-
-Sample PartitionConf
-
-#
-# Example PartitionConf
-# Author: John Doe
-# Date: 12/14/2001
-#
-Name=DEFAULT JobType=Batch
-#
-Name=pbatch Number=0 JobType=BATCH MaxCpus=128
-Name=debug Number=1 JobType=INTERACTIVE MaxCpus=4 MaxTime=60
-Name=super Number=3 JobType=BATCH MaxCpus=256 AllowUsers=cdunlap,garlick,jette
-Name=class Number=4 JobType=ALL MaxCpus=16 AllowUsers=student1,student2,student3
-
-
-Dependencies
-
-The Communications Library is required.
-This code is a component of the Machine Status Manager, which must be completed first.
-
-
-Module Testing
-
-Test Partition Manager with various PartitionConf specifications
-(with and without various defaults).
-Test Machine Status Manager API with all options, both from a privileged port
-and a non-privileged port (non-root user).
-Review logs from above tests.
-
-
-Integration and System Testing
-
-Test Machine Status Tool, all options in various combinations per man pages.
diff --git a/doc/txt/slurm.protocol.txt b/doc/txt/slurm.protocol.txt
deleted file mode 100644
index 25fd96aa0a3..00000000000
--- a/doc/txt/slurm.protocol.txt
+++ /dev/null
@@ -1,559 +0,0 @@
-Title: SLURM Protocol Definition
-Author: Kevin Tew (tew1@llnl.gov)
-Date: May 15, 2002
-
-Section 1 Overview:
-    The SLURM Protocol defines a standard message passing interface
-between the server, client, and utility modules of SLURM.  The protocol
-consists mostly of asynchronous request/response message pairs.  The
-underlying transport protocol will provide connectionless yet reliable and
-secure transport.  A few messages, such as node registration and accounting
-information updates, are singleton messages without a corresponding request
-or response.
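-
-Before the formal definition in Section 2 below, here is a small C sketch of
-the common message header it diagrams: a 16-bit version and flags word,
-followed by 32-bit message type and body length words.  The field widths are
-read off the diagram; the struct and function names are illustrative
-assumptions, not part of the protocol definition.
-
-    #include <stdint.h>
-    #include <string.h>
-    #include <arpa/inet.h>          /* htons()/htonl() */
-
-    typedef struct slurm_header {
-            uint16_t version;
-            uint16_t flags;
-            uint32_t msg_type;      /* one of the MessageTypes below */
-            uint32_t body_length;   /* bytes of message body to follow */
-    } slurm_header_t;
-
-    /* Pack a header into network byte order at the front of a send buffer
-     * (buf must have room for at least 12 bytes). */
-    static void pack_header(const slurm_header_t *h, unsigned char *buf)
-    {
-            uint16_t h16;
-            uint32_t h32;
-
-            h16 = htons(h->version);      memcpy(buf + 0, &h16, sizeof(h16));
-            h16 = htons(h->flags);        memcpy(buf + 2, &h16, sizeof(h16));
-            h32 = htonl(h->msg_type);     memcpy(buf + 4, &h32, sizeof(h32));
-            h32 = htonl(h->body_length);  memcpy(buf + 8, &h32, sizeof(h32));
-    }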
-
-Section 2 Protocol Definition:
-
-    SLURM Header
-     0              1              2              3
-    --------------------------------------------------
-    |Version                |Flags                   |
-    --------------------------------------------------
-    |Message Type                                    |
-    --------------------------------------------------
-    |Body Length (Bytes)                             |
-    --------------------------------------------------
-
-
-    MessageTypes
-    REGISTRATION MESSAGES
-    Node Registration
-    1   REQUEST_NODE_REGISTRATION_STATUS
-    2   MESSAGE_NODE_REGISTRATION_STATUS
-    This message will also occur without a corresponding request when a slurmd first comes up.
-
-
-    RESOURCE ALLOCATION MESSAGES
-    Resource Allocation / Inquiry
-    3   REQUEST_RESOURCE_ALLOCATION
-    4   RESPONSE_RESOURCE_ALLOCATION
-
-    Batch Job Submission
-    22  REQUEST_SUBMIT_BATCH_JOB
-    23  RESPONSE_SUBMIT_BATCH_JOB
-
-    Run Batch Job on Resource Allocation
-    30  REQUEST_BATCH_JOB_LAUNCH
-    31  RESPONSE_BATCH_JOB_LAUNCH
-    To be used by DPCS to launch a job once resources have become available.
-    Ask Moe about this one and DPCS job launch.
-
-
-    JOB MESSAGES
-    Signal Job
-    26  REQUEST_SIGNAL_JOB
-    27  RESPONSE_SIGNAL_JOB
-
-    Cancel Job
-    5   REQUEST_CANCEL_JOB
-    7   RESPONSE_CANCEL_JOB
-
-
-    JOB STEP MESSAGES
-
-    Create Job Step
-    35  REQUEST_CREATE_JOB_STEP
-    36  RESPONSE_CREATE_JOB_STEP
-
-    Get Job Step Info
-    44  REQUEST_GET_JOB_STEP_INFO
-    45  RESPONSE_GET_JOB_STEP_INFO
-
-    Job Resource Request
-    46  REQUEST_JOB_RESOURCE
-    47  RESPONSE_JOB_RESOURCE
-
-    Template
-    ??? Job Step
-    ??  REQUEST__JOB_STEP
-    ??  RESPONSE__JOB_STEP
-
-    Run Job Step
-    37  REQUEST_RUN_JOB_STEP
-    38  RESPONSE_RUN_JOB_STEP
-
-    Signal Job Step
-    28  REQUEST_SIGNAL_JOB_STEP
-    29  RESPONSE_SIGNAL_JOB_STEP
-
-    Cancel Job Step
-    24  REQUEST_CANCEL_JOB_STEP
-    25  RESPONSE_CANCEL_JOB_STEP
-
-
-    RECONFIGURE MESSAGES
-    Reconfigure
-    6   REQUEST_RECONFIGURE
-    21  RESPONSE_RECONFIGURE
-
-
-    INFO MESSAGES
-    Job Info
-    8   REQUEST_JOB_INFO
-    9   RESPONSE_JOB_INFO
-
-    Job Step Info
-    10  REQUEST_JOB_STEP_INFO
-    11  RESPONSE_JOB_STEP_INFO
-
-    Node Info
-    12  REQUEST_NODE_INFO
-    13  RESPONSE_NODE_INFO
-
-    Partition Info
-    14  REQUEST_PARTITION_INFO
-    15  RESPONSE_PARTITION_INFO
-
-    Accounting Info
-    16  REQUEST_ACCOUNTING_INFO
-    17  RESPONSE_ACCOUNTING_INFO
-
-    Build Info
-    18  REQUEST_BUILD_INFO
-    19  RESPONSE_BUILD_INFO
-
-
-    ACCOUNTING UPLOAD MESSAGES
-    Upload Accounting
-    20  MESSAGE_UPLOAD_ACCOUNTING_INFO
-
-
-    TASK MESSAGES
-    Launch Tasks
-    34  REQUEST_LAUNCH_TASKS
-    41  RESPONSE_LAUNCH_TASKS
-    Task Exit
-    32  MESSAGE_TASK_EXIT
-
-
-    CREDENTIAL MESSAGES
-    Revoke Credential
-    33  MESSAGE_REVOKE_JOB_CREDENTIAL ??
-
-
-    JOB ATTACH MESSAGES
-    39  REQUEST_JOB_ATTACH
-    40  RESPONSE_JOB_ATTACH
-
-    DPCS KEY MESSAGES
-    42  REQUEST_GET_KEY
-    43  RESPONSE_GET_KEY
-
-
-    Message Bodies:
-
-    Types of Message Bodies
-    REQUEST: to be followed by a RESPONSE
-    RESPONSE: to follow a REQUEST
-    MESSAGE: a single message not requiring a response
-
-    MessageType 1 REQUEST_NODE_REGISTRATION_STATUS
-     0              1              2              3
-    --------------------------------------------------
-
-    MessageType 2 MESSAGE_NODE_REGISTRATION_STATUS
-     0              1              2              3
-    --------------------------------------------------
-    |Time-Stamp                                      |
-    --------------------------------------------------
-    |Memory Size (MB)                                |
-    --------------------------------------------------
-    |Temporary Disk Space (MB)                       |
-    --------------------------------------------------
-
-    MessageType 3 REQUEST_RESOURCE_ALLOCATION
-    MessageType 22 REQUEST_SUBMIT_BATCH_JOB
-     0              1              2              3
-    --------------------------------------------------
-    |UserID                                          |
-    --------------------------------------------------
-    |Number of Nodes                                 |
-    --------------------------------------------------
-    |Number of Tasks                                 |
-    --------------------------------------------------
-    |CPUs per Task                                   |
-    --------------------------------------------------
-    |Task Distribution (Block or Cyclic)             |
-    --------------------------------------------------
-    |Wait for Response      |Test Only               |
-    --------------------------------------------------
-    The last two flag fields are invalid and will be ignored for a REQUEST_SUBMIT_BATCH_JOB message.
-
-     0              1              2              3
-    --------------------------------------------------
-    |Option 1               |Length                  |
-    --------------------------------------------------
-    |Partition                                       |
-    --------------------------------------------------
-
-     0              1              2              3
-    --------------------------------------------------
-    |Option 2               |Length                  |
-    --------------------------------------------------
-    |Time Limit                                      |
-    --------------------------------------------------
-
-     0              1              2              3
-    --------------------------------------------------
-    |Option 3               |Length                  |
-    --------------------------------------------------
-    |Constraints                                     |
-    --------------------------------------------------
-
-     0              1              2              3
-    --------------------------------------------------
-    |Option 4               |Length                  |
-    --------------------------------------------------
-    |Features                                        |
-    --------------------------------------------------
-
-     0              1              2              3
-    --------------------------------------------------
-    |Option 5               |Length                  |
-    --------------------------------------------------
-    |IO Location, i.e. stdin, stdout, stderr         |
-    --------------------------------------------------
-
-     0              1              2              3
-    --------------------------------------------------
-    |Option 6               |Length                  |
-    --------------------------------------------------
-    |Signal Handling Destination                     |
-    --------------------------------------------------
-
-    MessageType 4 RESPONSE_RESOURCE_ALLOCATION
-     0              1              2              3
-    --------------------------------------------------
-    |Return Code                                     |
-    --------------------------------------------------
-    |Job ID                                          |
-    --------------------------------------------------
-    |Node List Nodes/CPU                             |
-    |TBD                                             |
-    --------------------------------------------------
-
-    MessageType 23 RESPONSE_SUBMIT_BATCH_JOB
-     0              1              2              3
-    --------------------------------------------------
-    |Return Code                                     |
-    --------------------------------------------------
-    |Job ID                                          |
-    --------------------------------------------------
-
-
-    MessageType 5 REQUEST_CANCEL_JOB
-     0              1              2              3
-    --------------------------------------------------
-    |UserID                                          |
-    --------------------------------------------------
-    |Job ID                                          |
-    --------------------------------------------------
-
-    MessageType 24 REQUEST_CANCEL_JOB_STEP
-     0              1              2              3
-    --------------------------------------------------
-    |UserID                                          |
-    --------------------------------------------------
-    |Job ID                                          |
-    --------------------------------------------------
-    |Job Step ID                                     |
-    --------------------------------------------------
-
-    MessageType 26 REQUEST_SIGNAL_JOB
-     0              1              2              3
-    --------------------------------------------------
-    |UserID                                          |
-    --------------------------------------------------
-    |Job ID                                          |
-    --------------------------------------------------
-    |Signal Number                                   |
-    --------------------------------------------------
-
-    MessageType 28 REQUEST_SIGNAL_JOB_STEP
-     0              1              2              3
-    --------------------------------------------------
-    |UserID                                          |
-    --------------------------------------------------
-    |Job ID                                          |
-    --------------------------------------------------
-    |Job Step ID                                     |
-    --------------------------------------------------
-    |Signal Number                                   |
-    --------------------------------------------------
-     0              1              2              3
-    --------------------------------------------------
-    |Option 1                                        |
-    --------------------------------------------------
-    |Task ID                                         |
-    --------------------------------------------------
-
-    MessageType 6 REQUEST_RECONFIGURE
-     0              1              2              3
-    --------------------------------------------------
-    |UserID                                          |
-    --------------------------------------------------
-
-
-    MessageType 7 RESPONSE_CANCEL_JOB
-    MessageType 21 RESPONSE_RECONFIGURE
-    MessageType 25 RESPONSE_CANCEL_JOB_STEP
-    MessageType 27 RESPONSE_SIGNAL_JOB
-    MessageType 29 RESPONSE_SIGNAL_JOB_STEP
-     0              1              2              3
-    --------------------------------------------------
-    |Return Code                                     |
-    --------------------------------------------------
-    Desc: Returns the return code of the cancel job and reconfigure commands
-
-    MessageType 8 REQUEST_INFO_*
-     0              1              2              3
-    --------------------------------------------------
-    |Time-Stamp                                      |
-    --------------------------------------------------
-
-    MessageType 9 RESPONSE_INFO_*
-     0              1              2              3
-    --------------------------------------------------
-    |Return Code                                     |
-    --------------------------------------------------
-    |Time-Stamp                                      |
-    --------------------------------------------------
-    |Record Count                                    |
-    --------------------------------------------------
-    |Array of Records                                |
-    |                                                |
-    --------------------------------------------------
-
-    MessageType 39 REQUEST_JOB_ATTACH
-     0              1              2              3
-    --------------------------------------------------
-    |User ID                                         |
-    --------------------------------------------------
-    |Job Step ID                                     |
-    --------------------------------------------------
-    |stdin/stdout Port                               |
-    --------------------------------------------------
-    |signals/stderr Port                             |
-    --------------------------------------------------
-
-    MessageType 40 RESPONSE_JOB_ATTACH
-     0              1              2              3
-    --------------------------------------------------
-    |Return Code                                     |
-    --------------------------------------------------
-
-    MessageType 34 REQUEST_LAUNCH_TASKS
-     0              1              2              3
-    --------------------------------------------------
-    |User ID                                         |
-    --------------------------------------------------
-    |Job ID                                          |
-    --------------------------------------------------
-    |Job Step ID                                     |
-    --------------------------------------------------
-    |Credential                                      |
-    --------------------------------------------------
-    |Number of Tasks                                 |
-    --------------------------------------------------
-    |Environment                                     |
-    --------------------------------------------------
-    |Current Working Directory                      |
-    --------------------------------------------------
-    |Command Line                                    |
-    --------------------------------------------------
-    |stdin location                                  |
-    --------------------------------------------------
-    |stdout location                                 |
-    --------------------------------------------------
-    |stderr location                                 |
-    --------------------------------------------------
-    |Task Completion Reporting Port                  |
-    --------------------------------------------------
-
-    MessageType 41 RESPONSE_LAUNCH_TASKS
-     0              1              2              3
-    --------------------------------------------------
-    |Return Code                                     |
-    --------------------------------------------------
-
-    MessageType 44 REQUEST_GET_JOB_STEP_INFO
-     0              1              2              3
-    --------------------------------------------------
-    |Job ID                                          |
-    --------------------------------------------------
-    |Job Step ID                                     |
-    --------------------------------------------------
-    |User ID                                         |
-    --------------------------------------------------
-
-    MessageType 45 RESPONSE_GET_JOB_STEP_INFO
-     0              1              2              3
-    --------------------------------------------------
-    |Job Step ID                                     |
-    --------------------------------------------------
-    |elan context                                    |
-    --------------------------------------------------
-
-    MessageType 46 REQUEST_JOB_RESOURCE
-     0              1              2              3
-    --------------------------------------------------
-    |Job Step ID                                     |
-    --------------------------------------------------
-
-    MessageType 47 RESPONSE_JOB_RESOURCE
-     0              1              2              3
-    --------------------------------------------------
-    |Return Code                                     |
-    --------------------------------------------------
-    |Credentials                                     |
-    --------------------------------------------------
-    |Node List Nodes/CPU                             |
-    |TBD                                             |
-    --------------------------------------------------
-
-    MessageType 48 REQUEST_RUN_JOB_STEP
-     0              1              2              3
-    --------------------------------------------------
-    ??
-    --------------------------------------------------
-
-    MessageType 49 RESPONSE_RUN_JOB_STEP
-     0              1              2              3
-    --------------------------------------------------
-    ??
-    --------------------------------------------------
-
-
-    MessageType 42 REQUEST_GET_KEY
-     0              1              2              3
-    --------------------------------------------------
-    |User ID (must be root)                          |
-    --------------------------------------------------
-
-    MessageType 43 RESPONSE_GET_KEY
-     0              1              2              3
-    --------------------------------------------------
-    |Key                                             |
-    --------------------------------------------------
-
-    MessageType 32 MESSAGE_TASK_EXIT
-     0              1              2              3
-    --------------------------------------------------
-    |Return Code                                     |
-    --------------------------------------------------
-    |Job ID                                          |
-    --------------------------------------------------
-    |Job Step ID                                     |
-    --------------------------------------------------
-    |Task ID                                         |
-    --------------------------------------------------
-
-    MessageType 20 MESSAGE_UPLOAD_ACCOUNTING_INFO
-     0              1              2              3
-    --------------------------------------------------
-    |Time-Stamp                                      |
-    --------------------------------------------------
-    |Record Count                                    |
-    --------------------------------------------------
-    |Array of Accounting Records                     |
-    |                                                |
-    --------------------------------------------------
-
-    Accounting Record
-    --------------------------------------------------
-    |User ID                                         |
-    --------------------------------------------------
-    |Memory Used (MB)                                |
-    --------------------------------------------------
-    |CPU Time Used (Sec)                             |
-    --------------------------------------------------
-    |CPU Time Allocated (Sec)                        |
-    --------------------------------------------------
-
-Section 3 SLURM Programmers API:
-    3.1 High Level API
-    MessageFunctions
-    Registration Functions
-    request_node_registration ( IN address )
-        slurmctld -> slurmd
-    send_node_registration ( IN address , IN node_registration_msg_struct )
-        slurmd -> slurmctld
-
-    ResourceAllocation
-    request_resource_allocation ( IN job_spec_struct )
-        utility -> slurmctld
-    send_resource_allocation_acknowledgement ( IN resource_allocation_mesg_struct )
-        slurmctld -> utility
-
-    Information Functions
-    request_job_info ( IN timestamp )
-        utility -> slurmctld
-    send_job_info ( IN job_info_mesg_struct )
-        slurmctld -> utility
-    request_node_info ( IN timestamp )
-        utility -> slurmctld
-    send_node_info ( IN node_info_mesg_struct )
-        slurmctld -> utility
-    request_job_step_info ( IN timestamp )
-        utility -> slurmctld
-    send_job_step_info ( IN job_step_info_mesg_struct )
-        slurmctld -> utility
-    request_partition_info ( IN timestamp )
-        utility -> slurmctld
-    send_partition_info ( IN partition_info_mesg_struct )
-        slurmctld -> utility
-    request_accounting_info ( IN timestamp )
-        utility -> slurmctld
-    send_accounting_info ( IN accounting_info_mesg_struct )
-        slurmctld -> utility
-    request_build_info ( IN timestamp )
-        utility -> slurmctld
-    send_build_info ( IN build_info_mesg_struct )
-        slurmctld -> utility
-
-    Cancel Job
-    request_cancel_job ( cancel_job_mesg_struct )
-        utility -> slurmctld
-    send_cancel_job_confirmation ( cancel_job_confirmation_mesg_struct )
-        slurmctld -> utility
-
-    Reconfigure
-    request_reconfigure ( reconfigure_msg_struct )
-        utility -> slurmctld
-    send_reconfigure_confirmation ( reconfigure_confirmation_message_struct )
-        slurmctld -> utility
-
-    Upload Accounting Information
-    send_accounting_info ( accounting_update_mesg_struct )
-
-    get_message ( OUT address , OUT abstract_mesg_struct )
-
-
-    GeneralFunctions
-    3.2 Additional Low Level API
-    slurm_init_message_engine ( IN slurm_address )
-    slurm_shutdown_message_engine ( IN slurm_address )
-    slurm_send_server_message ( IN mesg_type , IN buffer , IN buffer_len , IN message )
-        This call attempts to connect to the primary slurmctld and upon failure falls back to the backup slurmctld.
-    send_node_message ( IN address , IN mesg_type , IN buffer , IN buffer_len , IN message )
-    recv_message ( OUT address , OUT mesg_type , OUT buffer , IN/OUT buffer_len )
-
-    file_descriptor slurm_listen_stream ( IN port )
-    file_descriptor slurm_accept_stream ( IN file_descriptor , IN/OUT slurm_address )
-    file_descriptor slurm_connect_stream ( IN slurm_address )
-    return_code slurm_close_stream ( IN file_descriptor )
-    size slurm_send ( IN file_descriptor , IN buffer , IN length , IN flags )
-    size slurm_recv ( IN file_descriptor , IN/OUT buffer , IN/OUT length , IN flags )
-
-Section 4 Lower Level Interface (LLI) API:
-    Almost an exact replica of the mongo/socket function prototypes; this layer will wrap either
-sockets or mongo based on a build configuration option.
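-
-As a closing illustration, here is how a server might use the stream calls
-listed in Section 3.2.  This is a hypothetical usage sketch: the concrete C
-types (int for file_descriptor, the placeholder slurm_addr struct) and the
-port value are assumptions, since only abstract prototypes are given above.
-
-    #include <stdint.h>
-    #include <sys/types.h>
-
-    /* Placeholder peer-address type; the real definition is unspecified. */
-    typedef struct slurm_addr { unsigned char data[32]; } slurm_addr;
-
-    /* Assumed C signatures for the prototypes sketched in Section 3.2. */
-    extern int     slurm_listen_stream(uint16_t port);
-    extern int     slurm_accept_stream(int listen_fd, slurm_addr *peer);
-    extern int     slurm_close_stream(int fd);
-    extern ssize_t slurm_recv(int fd, void *buf, size_t len, int flags);
-    extern ssize_t slurm_send(int fd, const void *buf, size_t len, int flags);
-
-    void serve_one_request(void)
-    {
-            char buf[4096];
-            slurm_addr peer;
-            int lfd, cfd;
-
-            lfd = slurm_listen_stream(612);     /* port is illustrative */
-            if (lfd < 0)
-                    return;
-            cfd = slurm_accept_stream(lfd, &peer);
-            if (cfd >= 0) {
-                    ssize_t n = slurm_recv(cfd, buf, sizeof(buf), 0);
-                    if (n > 0)
-                            slurm_send(cfd, buf, (size_t) n, 0);  /* echo, demo only */
-                    slurm_close_stream(cfd);
-            }
-            slurm_close_stream(lfd);
-    }
--- 
GitLab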