diff --git a/src/slurmctld/controller.c b/src/slurmctld/controller.c index 82e0780a7b448060eb61d7a15a43fc50fcdc67b4..717ae14e22d2938299140d15f4bed17a32317e52 100644 --- a/src/slurmctld/controller.c +++ b/src/slurmctld/controller.c @@ -42,6 +42,7 @@ #include <src/common/slurm_protocol_api.h> #include <src/common/macros.h> #include <src/common/xstring.h> +#include <src/slurmctld/locks.h> #include <src/slurmctld/slurmctld.h> #define BUF_SIZE 1024 @@ -88,6 +89,7 @@ main (int argc, char *argv[]) log_init(argv[0], log_opts, SYSLOG_FACILITY_DAEMON, NULL); init_ctld_conf ( &slurmctld_conf ); + init_locks ( ); parse_commandline ( argc, argv, &slurmctld_conf ); if ( ( error_code = read_slurm_conf ()) ) @@ -806,6 +808,10 @@ init_ctld_conf ( slurm_ctl_conf_t * conf_ptr ) void fill_ctld_conf ( slurm_ctl_conf_t * conf_ptr ) { + /* Locks: Read config */ + slurmctld_lock_t config_read_lock = { READ_LOCK, NO_LOCK, NO_LOCK, NO_LOCK }; + + lock_slurmctld (config_read_lock); conf_ptr->last_update = slurmctld_conf.last_update ; conf_ptr->backup_controller = slurmctld_conf.backup_controller ; conf_ptr->control_machine = slurmctld_conf.control_machine ; @@ -823,6 +829,8 @@ fill_ctld_conf ( slurm_ctl_conf_t * conf_ptr ) conf_ptr->slurm_conf = slurmctld_conf.slurm_conf ; conf_ptr->state_save_location = slurmctld_conf.state_save_location ; conf_ptr->tmp_fs = slurmctld_conf.tmp_fs ; + + unlock_slurmctld (config_read_lock); } /* Variables for commandline passing using getopt */ diff --git a/src/slurmctld/job_mgr.c b/src/slurmctld/job_mgr.c index c86881ac0c049ad89ba288e852faaf06fbe59f1b..cd00e232591c0489a0e0ae478da505d9aa55e972 100644 --- a/src/slurmctld/job_mgr.c +++ b/src/slurmctld/job_mgr.c @@ -45,6 +45,7 @@ #include <src/common/pack.h> #include <src/common/slurm_protocol_errno.h> #include <src/common/xstring.h> +#include <src/slurmctld/locks.h> #include <src/slurmctld/slurmctld.h> #define BUF_SIZE 1024 @@ -60,7 +61,6 @@ int job_count; /* job's in the system */ List job_list = NULL; /* job_record list */ time_t last_job_update; /* time of last update to job records */ -static pthread_mutex_t job_mutex = PTHREAD_MUTEX_INITIALIZER; /* lock for job info */ static struct job_record *job_hash[MAX_JOB_COUNT]; static struct job_record *job_hash_over[MAX_JOB_COUNT]; static int max_hash_over = 0; @@ -420,10 +420,15 @@ job_allocate (job_desc_msg_t *job_specs, uint32_t *new_job_id, char **node_list { int error_code; struct job_record *job_ptr; + /* Locks: Write job, write node, read partition */ + slurmctld_lock_t job_write_lock = { NO_LOCK, WRITE_LOCK, WRITE_LOCK, READ_LOCK }; + lock_slurmctld (job_write_lock); error_code = job_create (job_specs, new_job_id, allocate, will_run, &job_ptr); - if (error_code || will_run || (allocate == 0)) + if (error_code || will_run || (allocate == 0)) { + unlock_slurmctld (job_write_lock); return error_code; + } if (job_ptr == NULL) fatal ("job_allocate: allocated job %u lacks record", new_job_id); @@ -437,6 +442,7 @@ job_allocate (job_desc_msg_t *job_specs, uint32_t *new_job_id, char **node_list if (immediate && top_priority(job_ptr) != 1) { job_ptr->job_state = JOB_FAILED; job_ptr->end_time = 0; + unlock_slurmctld (job_write_lock); return ESLURM_NOT_TOP_PRIORITY; } @@ -445,15 +451,17 @@ job_allocate (job_desc_msg_t *job_specs, uint32_t *new_job_id, char **node_list if (immediate) { job_ptr->job_state = JOB_FAILED; job_ptr->end_time = 0; - return ESLURM_NODES_BUSY; } else /* job remains queued */ - return 0; + error_code = 0; + unlock_slurmctld (job_write_lock); + return error_code; } if (error_code) { /* fundamental flaw in job request */ job_ptr->job_state = JOB_FAILED; job_ptr->end_time = 0; + unlock_slurmctld (job_write_lock); return error_code; } @@ -466,6 +474,7 @@ job_allocate (job_desc_msg_t *job_specs, uint32_t *new_job_id, char **node_list *num_cpu_groups = job_ptr->num_cpu_groups; cpus_per_node[0] = job_ptr->cpus_per_node; cpu_count_reps[0] = job_ptr->cpu_count_reps; + unlock_slurmctld (job_write_lock); return 0; } @@ -481,23 +490,31 @@ int job_cancel (uint32_t job_id) { struct job_record *job_ptr; + /* Locks: Write job, write node */ + slurmctld_lock_t job_write_lock = { NO_LOCK, WRITE_LOCK, WRITE_LOCK, NO_LOCK }; + + lock_slurmctld (job_write_lock); job_ptr = find_job_record(job_id); if (job_ptr == NULL) { + unlock_slurmctld (job_write_lock); info ("job_cancel: invalid job id %u", job_id); return ESLURM_INVALID_JOB_ID; } if ((job_ptr->job_state == JOB_FAILED) || (job_ptr->job_state == JOB_COMPLETE) || - (job_ptr->job_state == JOB_TIMEOUT)) + (job_ptr->job_state == JOB_TIMEOUT)) { + unlock_slurmctld (job_write_lock); return ESLURM_ALREADY_DONE; + } if (job_ptr->job_state == JOB_PENDING) { last_job_update = time (NULL); job_ptr->job_state = JOB_FAILED; job_ptr->start_time = job_ptr->end_time = time(NULL); delete_job_details(job_ptr); + unlock_slurmctld (job_write_lock); verbose ("job_cancel of pending job %u successful", job_id); return 0; } @@ -508,14 +525,15 @@ job_cancel (uint32_t job_id) job_ptr->end_time = time(NULL); deallocate_nodes (job_ptr->node_bitmap); delete_job_details(job_ptr); + unlock_slurmctld (job_write_lock); verbose ("job_cancel of running job %u successful", job_id); return 0; } verbose ("job_cancel: job %u can't be cancelled from state=%s", job_id, job_state_string(job_ptr->job_state)); + unlock_slurmctld (job_write_lock); return ESLURM_TRANSITION_STATE_NO_UPDATE; - } /* @@ -922,21 +940,28 @@ job_step_cancel (uint32_t job_id, uint32_t step_id) { struct job_record *job_ptr; int error_code; + /* Locks: Write job */ + slurmctld_lock_t job_write_lock = { NO_LOCK, WRITE_LOCK, NO_LOCK, NO_LOCK }; + lock_slurmctld (job_write_lock); job_ptr = find_job_record(job_id); if (job_ptr == NULL) { + unlock_slurmctld (job_write_lock); info ("job_step_cancel: invalid job id %u", job_id); return ESLURM_INVALID_JOB_ID; } if ((job_ptr->job_state == JOB_FAILED) || (job_ptr->job_state == JOB_COMPLETE) || - (job_ptr->job_state == JOB_TIMEOUT)) + (job_ptr->job_state == JOB_TIMEOUT)) { + unlock_slurmctld (job_write_lock); return ESLURM_ALREADY_DONE; + } if (job_ptr->job_state == JOB_STAGE_IN) { last_job_update = time (NULL); error_code = delete_step_record (job_ptr, step_id); + unlock_slurmctld (job_write_lock); if (error_code == ENOENT) { info ("job_step_cancel step %u.%u not found", job_id, step_id); return ESLURM_ALREADY_DONE; @@ -947,6 +972,7 @@ job_step_cancel (uint32_t job_id, uint32_t step_id) info ("job_step_cancel: step %u.%u can't be cancelled from state=%s", job_id, step_id, job_state_string(job_ptr->job_state)); + unlock_slurmctld (job_write_lock); return ESLURM_TRANSITION_STATE_NO_UPDATE; } @@ -1000,32 +1026,6 @@ validate_job_desc ( job_desc_msg_t * job_desc_msg , int allocate ) return SLURM_SUCCESS ; } -/* job_lock - lock the job information - * global: job_mutex - semaphore for the job table - */ - void -job_lock () -{ - int error_code; - error_code = pthread_mutex_lock (&job_mutex); - if (error_code) - fatal ("job_lock: pthread_mutex_lock error %d", error_code); - -} - - -/* job_unlock - unlock the job information - * global: part_mutex - semaphore for the job table - */ - void -job_unlock () -{ - int error_code; - error_code = pthread_mutex_unlock (&job_mutex); - if (error_code) - fatal ("job_unlock: pthread_mutex_unlock error %d", error_code); -} - /* * list_delete_job - delete a job record and its corresponding job_details, * see common/list.h for documentation @@ -1136,12 +1136,16 @@ pack_all_jobs (char **buffer_ptr, int *buffer_size, time_t * update_time) char *buffer; void *buf_ptr; uint32_t jobs_packed ; + /* Locks: Read job */ + slurmctld_lock_t job_read_lock = { NO_LOCK, READ_LOCK, NO_LOCK, NO_LOCK }; + buffer_ptr[0] = NULL; *buffer_size = 0; if (*update_time == last_job_update) return; + lock_slurmctld (job_read_lock); buffer_allocated = (BUF_SIZE*16); buffer = xmalloc(buffer_allocated); buf_ptr = buffer; @@ -1174,6 +1178,7 @@ pack_all_jobs (char **buffer_ptr, int *buffer_size, time_t * update_time) jobs_packed ++ ; } + unlock_slurmctld (job_read_lock); list_iterator_destroy (job_record_iterator); buffer_offset = (char *)buf_ptr - buffer; xrealloc (buffer, buffer_offset); @@ -1221,8 +1226,7 @@ pack_job (struct job_record *dump_job_ptr, void **buf_ptr, int *buf_len) packstr (dump_job_ptr->partition, buf_ptr, buf_len); packstr (dump_job_ptr->name, buf_ptr, buf_len); if (dump_job_ptr->node_bitmap) { - (void) bit_fmt(tmp_str, MAX_STR_PACK, - dump_job_ptr->node_bitmap); + (void) bit_fmt(tmp_str, MAX_STR_PACK, dump_job_ptr->node_bitmap); packstr (tmp_str, buf_ptr, buf_len); } else @@ -1242,8 +1246,8 @@ pack_job (struct job_record *dump_job_ptr, void **buf_ptr, int *buf_len) pack32 ((uint32_t) detail_ptr->min_memory, buf_ptr, buf_len); pack32 ((uint32_t) detail_ptr->min_tmp_disk, buf_ptr, buf_len); - if (detail_ptr->req_nodes == NULL || - strlen (detail_ptr->req_nodes) < MAX_STR_PACK) + if ((detail_ptr->req_nodes == NULL) || + (strlen (detail_ptr->req_nodes) < MAX_STR_PACK)) packstr (detail_ptr->req_nodes, buf_ptr, buf_len); else { strncpy(tmp_str, detail_ptr->req_nodes, MAX_STR_PACK); @@ -1252,8 +1256,7 @@ pack_job (struct job_record *dump_job_ptr, void **buf_ptr, int *buf_len) } if (detail_ptr->req_node_bitmap) { - (void) bit_fmt(tmp_str, MAX_STR_PACK, - detail_ptr->req_node_bitmap); + (void) bit_fmt(tmp_str, MAX_STR_PACK, detail_ptr->req_node_bitmap); packstr (tmp_str, buf_ptr, buf_len); } else @@ -1452,9 +1455,14 @@ update_job (job_desc_msg_t * job_specs) struct job_details *detail_ptr; struct part_record *tmp_part_ptr; bitstr_t *req_bitmap = NULL ; + /* Locks: Write job, read node, read partition */ + slurmctld_lock_t job_write_lock = { NO_LOCK, WRITE_LOCK, READ_LOCK, READ_LOCK }; + + lock_slurmctld (job_write_lock); job_ptr = find_job_record (job_specs -> job_id); if (job_ptr == NULL) { + unlock_slurmctld (job_write_lock); error ("update_job: job_id %u does not exist.", job_specs -> job_id); return ESLURM_INVALID_JOB_ID; } @@ -1533,8 +1541,10 @@ update_job (job_desc_msg_t * job_specs) if (job_specs -> partition) { tmp_part_ptr = find_part_record (job_specs -> partition); - if (tmp_part_ptr == NULL) + if (tmp_part_ptr == NULL) { + unlock_slurmctld (job_write_lock); return ESLURM_INVALID_PARTITION_NAME; + } strncpy(job_ptr -> partition, job_specs -> partition, MAX_NAME_LEN); job_ptr -> part_ptr = tmp_part_ptr; info ("update_job: setting partition to %s for job_id %u", @@ -1545,6 +1555,7 @@ update_job (job_desc_msg_t * job_specs) if (job_specs -> req_nodes && detail_ptr) { error_code = node_name2bitmap (job_specs->req_nodes, &req_bitmap); if (error_code == EINVAL) { + unlock_slurmctld (job_write_lock); if ( req_bitmap ) bit_free (req_bitmap); return ESLURM_INVALID_NODE_NAME; @@ -1561,5 +1572,6 @@ update_job (job_desc_msg_t * job_specs) job_specs -> req_nodes = NULL; } + unlock_slurmctld (job_write_lock); return SLURM_PROTOCOL_SUCCESS; } diff --git a/src/slurmctld/locks.c b/src/slurmctld/locks.c index 257c6bc0a81bfdb252fe114cdd9267a0c1bef29e..6d9b82cf9ba0aebd71f1f1fa8fbd6513fe2665eb 100644 --- a/src/slurmctld/locks.c +++ b/src/slurmctld/locks.c @@ -66,7 +66,7 @@ void init_locks ( ) { if (sem_id == -1) - sem_id = semget ( IPC_PRIVATE, (COUNT_OF_LOCKS * 3), IPC_CREAT ); + sem_id = semget ( IPC_PRIVATE, (COUNT_OF_LOCKS * 3), IPC_CREAT | 0600 ); if (sem_id < 0) fatal ("semget errno %d", errno); @@ -103,7 +103,7 @@ lock_slurmctld (slurmctld_lock_t lock_levels) wr_wrlock (PART_LOCK); } -/* Issue the required unlock requests in a well defined order */ +/* unlock_slurmctld - Issue the required unlock requests in a well defined order */ void unlock_slurmctld (slurmctld_lock_t lock_levels) { @@ -128,6 +128,7 @@ unlock_slurmctld (slurmctld_lock_t lock_levels) wr_wrunlock (CONFIG_LOCK); } +/* wr_rdlock - Issue a read lock on the specified data type */ void wr_rdlock (lock_datatype_t datatype) { @@ -145,6 +146,7 @@ wr_rdlock (lock_datatype_t datatype) fatal ("semop errno %d", errno); } +/* wr_rdunlock - Issue a read unlock on the specified data type */ void wr_rdunlock (lock_datatype_t datatype) { @@ -154,10 +156,11 @@ wr_rdunlock (lock_datatype_t datatype) rdunlock_op[0] . sem_num = read_lock (datatype); - if (semop (sem_id, rdunlock_op, 3) == -1) + if (semop (sem_id, rdunlock_op, 1) == -1) fatal ("semop errno %d", errno); } +/* wr_wrlock - Issue a write lock on the specified data type */ void wr_wrlock (lock_datatype_t datatype) { @@ -179,13 +182,14 @@ wr_wrlock (lock_datatype_t datatype) wrlock_op[2] . sem_num = write_wait_lock (datatype); wrlock_op[3] . sem_num = write_lock (datatype); - if (semop (sem_id, waitlock_op, 3) == -1) + if (semop (sem_id, waitlock_op, 1) == -1) fatal ("semop errno %d", errno); - if (semop (sem_id, wrlock_op, 3) == -1) + if (semop (sem_id, wrlock_op, 4) == -1) fatal ("semop errno %d", errno); } +/* wr_wrunlock - Issue a write unlock on the specified data type */ void wr_wrunlock (lock_datatype_t datatype) { diff --git a/src/slurmctld/node_mgr.c b/src/slurmctld/node_mgr.c index 6d3b7c222c010c0d629e29a2c1472117244eb49a..8f50f2590a7897a56c3a70092e8cc5a50ca44afc 100644 --- a/src/slurmctld/node_mgr.c +++ b/src/slurmctld/node_mgr.c @@ -37,6 +37,7 @@ #include <string.h> #include <src/common/hostlist.h> +#include <src/slurmctld/locks.h> #include <src/slurmctld/slurmctld.h> #define BUF_SIZE 1024 @@ -48,15 +49,14 @@ struct config_record default_config_record; struct node_record default_node_record; time_t last_bitmap_update = (time_t) NULL; /* time of last node creation or deletion */ time_t last_node_update = (time_t) NULL; /* time of last update to node records */ -pthread_mutex_t node_mutex = PTHREAD_MUTEX_INITIALIZER; /* lock for node and config info */ bitstr_t *up_node_bitmap = NULL; /* bitmap of nodes are up */ bitstr_t *idle_node_bitmap = NULL; /* bitmap of nodes are idle */ -int delete_config_record (); -void dump_hash (); -int hash_index (char *name); -void split_node_name (char *name, char *prefix, char *suffix, int *index, int *digits); +int delete_config_record (); +void dump_hash (); +int hash_index (char *name); +void split_node_name (char *name, char *prefix, char *suffix, int *index, int *digits); #if DEBUG_MODULE /* main is used here for testing purposes only */ @@ -716,20 +716,6 @@ list_find_config (void *config_entry, void *key) } -/* node_lock - lock the node and configuration information - * global: node_mutex - semaphore for the global node information - */ -void -node_lock () -{ - int error_code; - error_code = pthread_mutex_lock (&node_mutex); - if (error_code) - fatal ("node_lock: pthread_mutex_lock error %d", error_code); - -} - - /* * node_name2bitmap - given a node name regular expression, build a bitmap representation * input: node_names - list of nodes @@ -786,19 +772,6 @@ node_name2bitmap (char *node_names, bitstr_t **bitmap) } -/* node_unlock - unlock the node and configuration information - * global: node_mutex - semaphore for the global node information - */ -void -node_unlock () -{ - int error_code; - error_code = pthread_mutex_unlock (&node_mutex); - if (error_code) - fatal ("node_unlock: pthread_mutex_unlock error %d", error_code); -} - - /* * pack_all_node - dump all configuration and node information for all nodes in * machine independent form (for network transmission) @@ -822,12 +795,15 @@ pack_all_node (char **buffer_ptr, int *buffer_size, time_t * update_time) char *buffer; void *buf_ptr; int nodes_packed; + /* Locks: Read node */ + slurmctld_lock_t node_read_lock = { NO_LOCK, NO_LOCK, READ_LOCK, NO_LOCK }; buffer_ptr[0] = NULL; *buffer_size = 0; if (*update_time == last_node_update) return; + lock_slurmctld (node_read_lock); buffer_allocated = (BUF_SIZE*16); buffer = xmalloc(buffer_allocated); buf_ptr = buffer; @@ -858,6 +834,7 @@ pack_all_node (char **buffer_ptr, int *buffer_size, time_t * update_time) nodes_packed ++ ; } + unlock_slurmctld (node_read_lock); buffer_offset = (char *)buf_ptr - buffer; xrealloc (buffer, buffer_offset); @@ -990,6 +967,9 @@ update_node ( update_node_msg_t * update_node_msg ) char *this_node_name ; struct node_record *node_record_point; hostlist_t host_list; + /* Locks: Write node */ + slurmctld_lock_t node_write_lock = { NO_LOCK, NO_LOCK, WRITE_LOCK, NO_LOCK }; + if (update_node_msg -> node_names == NULL ) { error ("update_node: invalid node name %s\n", update_node_msg -> node_names ); @@ -1003,6 +983,7 @@ update_node ( update_node_msg_t * update_node_msg ) return ESLURM_INVALID_NODE_NAME; } + lock_slurmctld (node_write_lock); last_node_update = time (NULL); while ( (this_node_name = hostlist_shift (host_list)) ) { node_record_point = find_node_record (this_node_name); @@ -1039,6 +1020,7 @@ update_node ( update_node_msg_t * update_node_msg ) free (this_node_name); } + unlock_slurmctld (node_write_lock); hostlist_destroy (host_list); return error_code; } @@ -1060,9 +1042,13 @@ validate_node_specs (char *node_name, uint32_t cpus, int error_code; struct config_record *config_ptr; struct node_record *node_ptr; + /* Locks: Write node */ + slurmctld_lock_t node_write_lock = { NO_LOCK, NO_LOCK, WRITE_LOCK, NO_LOCK }; + lock_slurmctld (node_write_lock); node_ptr = find_node_record (node_name); if (node_ptr == NULL) { + unlock_slurmctld (node_write_lock); return ENOENT; } node_ptr->last_response = time (NULL); @@ -1106,5 +1092,6 @@ validate_node_specs (char *node_name, uint32_t cpus, bit_set (up_node_bitmap, (node_ptr - node_record_table_ptr)); } + unlock_slurmctld (node_write_lock); return error_code; } diff --git a/src/slurmctld/partition_mgr.c b/src/slurmctld/partition_mgr.c index a08ae8f6c0520bcfe20d2f30da2c65b42fb24085..c3d79cccabe2e11570192a877c3a7a2a3c8a509d 100644 --- a/src/slurmctld/partition_mgr.c +++ b/src/slurmctld/partition_mgr.c @@ -38,6 +38,7 @@ #include <src/common/hostlist.h> #include <src/common/list.h> +#include <src/slurmctld/locks.h> #include <src/slurmctld/slurmctld.h> #define BUF_SIZE 1024 @@ -47,11 +48,10 @@ List part_list = NULL; /* partition list */ char default_part_name[MAX_NAME_LEN]; /* name of default partition */ struct part_record *default_part_loc = NULL; /* location of default partition */ time_t last_part_update; /* time of last update to partition records */ -static pthread_mutex_t part_mutex = PTHREAD_MUTEX_INITIALIZER; /* lock for partition info */ -int build_part_bitmap (struct part_record *part_record_point); -void list_delete_part (void *part_entry); -int list_find_part (void *part_entry, void *key); +int build_part_bitmap (struct part_record *part_record_point); +void list_delete_part (void *part_entry); +int list_find_part (void *part_entry, void *key); #if DEBUG_MODULE /* main is used here for module testing purposes only */ @@ -447,12 +447,15 @@ pack_all_part (char **buffer_ptr, int *buffer_size, time_t * update_time) char *buffer; void *buf_ptr; int parts_packed; + /* Locks: Read partition */ + slurmctld_lock_t part_read_lock = { NO_LOCK, NO_LOCK, NO_LOCK, READ_LOCK }; buffer_ptr[0] = NULL; *buffer_size = 0; if (*update_time == last_part_update) return; + lock_slurmctld (part_read_lock); buffer_allocated = (BUF_SIZE*16); buffer = xmalloc(buffer_allocated); buf_ptr = buffer; @@ -486,6 +489,7 @@ pack_all_part (char **buffer_ptr, int *buffer_size, time_t * update_time) } list_iterator_destroy (part_record_iterator); + unlock_slurmctld (part_read_lock); buffer_offset = (char *)buf_ptr - buffer; xrealloc (buffer, buffer_offset); @@ -545,33 +549,6 @@ pack_part (struct part_record *part_record_point, void **buf_ptr, int *buf_len) } -/* part_lock - lock the partition information - * global: part_mutex - semaphore for the partition table - */ -void -part_lock () -{ - int error_code; - error_code = pthread_mutex_lock (&part_mutex); - if (error_code) - fatal ("part_lock: pthread_mutex_lock error %d", error_code); - -} - - -/* part_unlock - unlock the partition information - * global: part_mutex - semaphore for the partition table - */ -void -part_unlock () -{ - int error_code; - error_code = pthread_mutex_unlock (&part_mutex); - if (error_code) - fatal ("part_unlock: pthread_mutex_unlock error %d", error_code); -} - - /* * update_part - update a partition's configuration data * global: part_list - list of partition entries @@ -582,6 +559,8 @@ update_part (update_part_msg_t * part_desc ) { int error_code, i; struct part_record *part_ptr; + /* Locks: Read node, write partition */ + slurmctld_lock_t part_write_lock = { NO_LOCK, NO_LOCK, READ_LOCK, WRITE_LOCK }; if ((part_desc -> name == NULL ) || (strlen (part_desc->name ) >= MAX_NAME_LEN)) { @@ -590,6 +569,7 @@ update_part (update_part_msg_t * part_desc ) } error_code = 0; + lock_slurmctld (part_write_lock); part_ptr = list_find_first (part_list, &list_find_part, part_desc->name); if (part_ptr == NULL) { @@ -667,7 +647,8 @@ update_part (update_part_msg_t * part_desc ) if (backup_node_list) xfree(backup_node_list); } - return error_code; - } + } + + unlock_slurmctld (part_write_lock); return error_code; } diff --git a/src/slurmctld/read_config.c b/src/slurmctld/read_config.c index 5ec1e1412c3c2eabab986cf09cc658bbe826bcc4..17a9839a2d036c2356b0f69dc04c8598255d9dc0 100644 --- a/src/slurmctld/read_config.c +++ b/src/slurmctld/read_config.c @@ -40,14 +40,15 @@ #include <src/common/list.h> #include <src/common/macros.h> #include <src/common/parse_spec.h> +#include <src/slurmctld/locks.h> #include <src/slurmctld/slurmctld.h> #define BUF_SIZE 1024 -int init_slurm_conf (); -int parse_config_spec (char *in_line); -int parse_node_spec (char *in_line); -int parse_part_spec (char *in_line); +int init_slurm_conf (); +int parse_config_spec (char *in_line); +int parse_node_spec (char *in_line); +int parse_part_spec (char *in_line); static char highest_node_name[MAX_NAME_LEN] = ""; int node_record_count = 0; @@ -811,11 +812,16 @@ read_slurm_conf ( ) { int line_num; /* line number in input file */ char in_line[BUF_SIZE]; /* input line */ int i, j, error_code; + /* Locks: Write configuration, write job, write node, write partition */ + slurmctld_lock_t config_write_lock = { WRITE_LOCK, WRITE_LOCK, WRITE_LOCK, WRITE_LOCK }; /* initialization */ + lock_slurmctld (config_write_lock); start_time = clock (); - if ( (error_code = init_slurm_conf ()) ) + if ( (error_code = init_slurm_conf ()) ) { + unlock_slurmctld (config_write_lock); return error_code; + } slurm_spec_file = fopen (slurmctld_conf.slurm_conf, "r"); if (slurm_spec_file == NULL) @@ -832,6 +838,7 @@ read_slurm_conf ( ) { error ("read_slurm_conf line %d, of input file %s too long\n", line_num, slurmctld_conf.slurm_conf); fclose (slurm_spec_file); + unlock_slurmctld (config_write_lock); return E2BIG; break; } @@ -858,18 +865,21 @@ read_slurm_conf ( ) { /* overall configuration parameters */ if ((error_code = parse_config_spec (in_line))) { fclose (slurm_spec_file); + unlock_slurmctld (config_write_lock); return error_code; } /* node configuration parameters */ if ((error_code = parse_node_spec (in_line))) { fclose (slurm_spec_file); + unlock_slurmctld (config_write_lock); return error_code; } /* partition configuration parameters */ if ((error_code = parse_part_spec (in_line))) { fclose (slurm_spec_file); + unlock_slurmctld (config_write_lock); return error_code; } @@ -884,27 +894,33 @@ read_slurm_conf ( ) { if (slurmctld_conf.control_machine == NULL) { fatal ("read_slurm_conf: control_machine value not specified."); + unlock_slurmctld (config_write_lock); return EINVAL; } if (default_part_loc == NULL) { error ("read_slurm_conf: default partition not set."); + unlock_slurmctld (config_write_lock); return EINVAL; } if (node_record_count < 1) { error ("read_slurm_conf: no nodes configured."); + unlock_slurmctld (config_write_lock); return EINVAL; } rehash (); - if ((error_code = build_bitmaps ())) + if ((error_code = build_bitmaps ())) { + unlock_slurmctld (config_write_lock); return error_code; + } list_sort (config_list, &list_compare_config); slurmctld_conf.last_update = time (NULL) ; info ("read_slurm_conf: finished loading configuration, time=%ld", (long) (clock () - start_time)); + unlock_slurmctld (config_write_lock); return SLURM_SUCCESS; } diff --git a/src/slurmctld/slurmctld.h b/src/slurmctld/slurmctld.h index dc414a33eaecb7f4f44718fa26ef4a05b1e99f40..22e25cff0560b628eec0f6ee6458b56fc758166f 100644 --- a/src/slurmctld/slurmctld.h +++ b/src/slurmctld/slurmctld.h @@ -302,12 +302,6 @@ extern int job_step_cancel (uint32_t job_id, uint32_t job_step_id); extern int job_create (job_desc_msg_t * job_specs, uint32_t *new_job_id, int allocate, int will_run, struct job_record **job_rec_ptr); -/* job_lock - lock the job information */ -extern void job_lock (); - -/* job_unlock - unlock the job information */ -extern void job_unlock (); - /* list_compare_config - compare two entry from the config list based upon weight */ extern int list_compare_config (void *config_entry1, void *config_entry2); @@ -330,12 +324,6 @@ extern int match_feature (char *seek, char *available); /* match_group - determine if the user is a member of any groups permitted to use this partition */ extern int match_group (char *allow_groups, char *user_groups); -/* node_lock - lock the node and configuration information */ -extern void node_lock (); - -/* node_unlock - unlock the node and configuration information */ -extern void node_unlock (); - /* node_name2bitmap - given a node name regular expression, build a bitmap representation */ extern int node_name2bitmap (char *node_names, bitstr_t **bitmap); @@ -390,12 +378,6 @@ extern void pack_part (struct part_record *part_record_point, void **buf_ptr, in */ extern void pack_step (struct step_record *dump_step_ptr, void **buf_ptr, int *buf_len); -/* part_lock - lock the partition information */ -extern void part_lock (); - -/* part_unlock - unlock the partition information */ -extern void part_unlock (); - /* * purge_old_job - purge old job records. if memory space is needed. * the jobs must have completed at least MIN_JOB_AGE minutes ago