From b484bbc1b58ba833fe26e2e61971fe23bcc7d673 Mon Sep 17 00:00:00 2001
From: Moe Jette <jette1@llnl.gov>
Date: Tue, 10 Sep 2002 20:53:22 +0000
Subject: [PATCH] Moved semaphores (mostly) up to controller.c. Added
 semaphores for some job_step functions that were lacking them.

---
 src/slurmctld/controller.c     | 135 ++++++++++++++++++++++++++++------
 src/slurmctld/job_mgr.c        |  91 ++++++----------------
 src/slurmctld/node_mgr.c       |  29 +++-----
 src/slurmctld/node_scheduler.c |   4 +-
 src/slurmctld/partition_mgr.c  |  10 +--
 src/slurmctld/read_config.c    |  16 +---
 src/slurmctld/step_mgr.c       |  23 +-----
 7 files changed, 146 insertions(+), 162 deletions(-)

diff --git a/src/slurmctld/controller.c b/src/slurmctld/controller.c
index af200471bd0..796ca6a11e2 100644
--- a/src/slurmctld/controller.c
+++ b/src/slurmctld/controller.c
@@ -218,6 +218,8 @@ slurmctld_signal_hand ( void * no_data )
 	int sig ;
 	int error_code;
 	sigset_t set;
+	/* Locks: Write configuration, write job, write node, write partition */
+	slurmctld_lock_t config_write_lock = { WRITE_LOCK, WRITE_LOCK, WRITE_LOCK, WRITE_LOCK };
 
 	(void) pthread_setcancelstate (PTHREAD_CANCEL_ENABLE, NULL);
 	(void) pthread_setcanceltype (PTHREAD_CANCEL_ASYNCHRONOUS, NULL);
@@ -246,7 +248,11 @@ slurmctld_signal_hand ( void * no_data )
 			break;
 		case SIGHUP:	/* kill -1 */
 			info ("Reconfigure signal (SIGHUP) received\n");
+			lock_slurmctld (config_write_lock);
 			error_code = read_slurm_conf (0);
+			if (error_code == 0)
+				reset_job_bitmaps ();
+			unlock_slurmctld (config_write_lock);
 			if (error_code)
 				error ("read_slurm_conf error %d", error_code);
 			break;
@@ -600,20 +606,26 @@ slurm_rpc_dump_build ( slurm_msg_t * msg )
 	slurm_msg_t response_msg ;
 	last_update_msg_t * last_time_msg = ( last_update_msg_t * ) msg-> data ;
 	slurm_ctl_conf_info_msg_t build_tbl ;
+	/* Locks: Read config */
+	slurmctld_lock_t config_read_lock = { READ_LOCK, NO_LOCK, NO_LOCK, NO_LOCK };
 
 	start_time = clock ();
+	debug ("Processing RPC: REQUEST_BUILD_INFO");
+	lock_slurmctld (config_read_lock);
 
 	/* check to see if configuration data has changed */
 	if ( last_time_msg -> last_update >= slurmctld_conf.last_update )
 	{
+		unlock_slurmctld (config_read_lock);
 		info ("slurm_rpc_dump_build, no change, time=%ld",
 			(long) (clock () - start_time));
 		slurm_send_rc_msg ( msg , SLURM_NO_CHANGE_IN_DATA );
 	}
 	else
 	{
-		/* success */
 		fill_ctld_conf ( & build_tbl ) ;
+		unlock_slurmctld (config_read_lock);
+
 		/* init response_msg structure */
 		response_msg . address = msg -> address ;
 		response_msg . msg_type = RESPONSE_BUILD_INFO ;
@@ -635,19 +647,25 @@ slurm_rpc_dump_jobs ( slurm_msg_t * msg )
 	slurm_msg_t response_msg ;
 	job_info_request_msg_t * last_time_msg = ( job_info_request_msg_t * ) msg-> data ;
 	time_t last_update = last_time_msg -> last_update ;
-
+	/* Locks: Read job */
+	slurmctld_lock_t job_read_lock = { NO_LOCK, READ_LOCK, NO_LOCK, NO_LOCK };
+
 	start_time = clock ();
+	debug ("Processing RPC: REQUEST_JOB_INFO");
+	lock_slurmctld (job_read_lock);
 
 	if ( last_time_msg -> last_update >= last_job_update )
 	{
+		unlock_slurmctld (job_read_lock);
 		info ("slurm_rpc_dump_jobs, no change, time=%ld",
 			(long) (clock () - start_time));
 		slurm_send_rc_msg ( msg , SLURM_NO_CHANGE_IN_DATA );
 	}
 	else
 	{
-		/* success */
 		pack_all_jobs (&dump, &dump_size, &last_update);
+		unlock_slurmctld (job_read_lock);
+
 		/* init response_msg structure */
 		response_msg . address = msg -> address ;
 		response_msg . msg_type = RESPONSE_JOB_INFO ;
@@ -673,19 +691,25 @@ slurm_rpc_dump_nodes ( slurm_msg_t * msg )
 	slurm_msg_t response_msg ;
 	last_update_msg_t * last_time_msg = ( last_update_msg_t * ) msg-> data ;
 	time_t last_update = last_time_msg -> last_update ;
+	/* Locks: Read node */
+	slurmctld_lock_t node_read_lock = { NO_LOCK, NO_LOCK, READ_LOCK, NO_LOCK };
 
 	start_time = clock ();
+	debug ("Processing RPC: REQUEST_NODE_INFO");
+	lock_slurmctld (node_read_lock);
 
 	if ( last_time_msg -> last_update >= last_node_update )
 	{
+		unlock_slurmctld (node_read_lock);
 		info ("slurm_rpc_dump_nodes, no change, time=%ld",
 			(long) (clock () - start_time));
 		slurm_send_rc_msg ( msg , SLURM_NO_CHANGE_IN_DATA );
 	}
 	else
 	{
-		/* success */
 		pack_all_node (&dump, &dump_size, &last_update);
+		unlock_slurmctld (node_read_lock);
+
 		/* init response_msg structure */
 		response_msg . address = msg -> address ;
 		response_msg . msg_type = RESPONSE_NODE_INFO ;
@@ -711,19 +735,25 @@ slurm_rpc_dump_partitions ( slurm_msg_t * msg )
 	slurm_msg_t response_msg ;
 	last_update_msg_t * last_time_msg = ( last_update_msg_t * ) msg-> data ;
 	time_t last_update = last_time_msg -> last_update ;
+	/* Locks: Read partition */
+	slurmctld_lock_t part_read_lock = { NO_LOCK, NO_LOCK, NO_LOCK, READ_LOCK };
 
 	start_time = clock ();
+	debug ("Processing RPC: REQUEST_PARTITION_INFO");
+	lock_slurmctld (part_read_lock);
 
 	if ( last_time_msg -> last_update >= last_part_update )
 	{
+		unlock_slurmctld (part_read_lock);
 		info ("slurm_rpc_dump_partitions, no change, time=%ld",
 			(long) (clock () - start_time));
 		slurm_send_rc_msg ( msg , SLURM_NO_CHANGE_IN_DATA );
 	}
 	else
 	{
-		/* success */
 		pack_all_part (&dump, &dump_size, &last_update);
+		unlock_slurmctld (part_read_lock);
+
 		/* init response_msg structure */
 		response_msg . address = msg -> address ;
 		response_msg . msg_type = RESPONSE_PARTITION_INFO ;
@@ -747,12 +777,17 @@ slurm_rpc_job_step_cancel ( slurm_msg_t * msg )
 	int error_code;
 	clock_t start_time;
 	job_step_id_msg_t * job_step_id_msg = ( job_step_id_msg_t * ) msg-> data ;
+	/* Locks: Write job, write node */
+	slurmctld_lock_t job_write_lock = { NO_LOCK, WRITE_LOCK, WRITE_LOCK, NO_LOCK };
 
 	start_time = clock ();
+	debug ("Processing RPC: REQUEST_CANCEL_JOB_STEP");
+	lock_slurmctld (job_write_lock);
 
 	/* do RPC call */
 	if (job_step_id_msg->job_step_id == NO_VAL) {
 		error_code = job_cancel ( job_step_id_msg->job_id );
+		unlock_slurmctld (job_write_lock);
 
 		/* return result */
 		if (error_code)
@@ -766,6 +801,8 @@ slurm_rpc_job_step_cancel ( slurm_msg_t * msg )
 			info ("slurm_rpc_job_step_cancel success for JobId=%u, time=%ld",
 				job_step_id_msg->job_id, (long) (clock () - start_time));
 			slurm_send_rc_msg ( msg , SLURM_SUCCESS );
+
+			/* Below functions provide their own locking */
 			schedule ();
 			(void) dump_all_job_state ( );
 		}
@@ -774,6 +811,8 @@ slurm_rpc_job_step_cancel ( slurm_msg_t * msg )
 	else {
 		error_code = job_step_cancel ( job_step_id_msg->job_id ,
 				job_step_id_msg->job_step_id);
+		unlock_slurmctld (job_write_lock);
+
 		/* return result */
 		if (error_code)
 		{
@@ -788,6 +827,8 @@ slurm_rpc_job_step_cancel ( slurm_msg_t * msg )
 				job_step_id_msg->job_id, job_step_id_msg->job_step_id,
 				(long) (clock () - start_time));
 			slurm_send_rc_msg ( msg , SLURM_SUCCESS );
+
+			/* Below function provides its own locking */
 			(void) dump_all_job_state ( );
 		}
 	}
@@ -798,16 +839,21 @@ slurm_rpc_job_step_cancel ( slurm_msg_t * msg )
 void
 slurm_rpc_job_step_complete ( slurm_msg_t * msg )
 {
-	/* init */
 	int error_code;
 	clock_t start_time;
 	job_step_id_msg_t * job_step_id_msg = ( job_step_id_msg_t * ) msg-> data ;
+	/* Locks: Write job, write node */
+	slurmctld_lock_t job_write_lock = { NO_LOCK, WRITE_LOCK, WRITE_LOCK, NO_LOCK };
 
+	/* init */
 	start_time = clock ();
+	debug ("Processing RPC: REQUEST_COMPLETE_JOB_STEP");
+	lock_slurmctld (job_write_lock);
 
 	/* do RPC call */
 	if (job_step_id_msg->job_step_id == NO_VAL) {
 		error_code = job_complete ( job_step_id_msg->job_id );
+		unlock_slurmctld (job_write_lock);
 
 		/* return result */
 		if (error_code)
@@ -821,13 +867,15 @@ slurm_rpc_job_step_complete ( slurm_msg_t * msg )
 			info ("slurm_rpc_job_step_complete success for JobId=%u, time=%ld",
 				job_step_id_msg->job_id, (long) (clock () - start_time));
 			slurm_send_rc_msg ( msg , SLURM_SUCCESS );
-			schedule ();
-			(void) dump_all_job_state ();
+			schedule ();			/* Has own locking */
+			(void) dump_all_job_state ();	/* Has own locking */
 		}
 	}
 	else {
-		error_code = job_step_complete ( job_step_id_msg->job_id ,
+		error_code = job_step_complete ( job_step_id_msg->job_id,
 				job_step_id_msg->job_step_id);
+		unlock_slurmctld (job_write_lock);
+
 		/* return result */
 		if (error_code)
 		{
@@ -842,7 +890,7 @@ slurm_rpc_job_step_complete ( slurm_msg_t * msg )
 				job_step_id_msg->job_id, job_step_id_msg->job_step_id,
 				(long) (clock () - start_time));
 			slurm_send_rc_msg ( msg , SLURM_SUCCESS );
-			(void) dump_all_job_state ( );
+			(void) dump_all_job_state ( );	/* Has own locking */
 		}
 	}
 }
@@ -855,18 +903,25 @@ slurm_rpc_job_step_get_info ( slurm_msg_t * msg )
 	int resp_buffer_size = 0;
 	int error_code = 0;
 	job_step_info_request_msg_t* request = ( job_step_info_request_msg_t * ) msg-> data ;
+	/* Locks: Read job */
+	slurmctld_lock_t job_read_lock = { NO_LOCK, READ_LOCK, NO_LOCK, NO_LOCK };
 
 	start_time = clock ();
+	debug ("Processing RPC: REQUEST_JOB_STEP_INFO");
+	lock_slurmctld (job_read_lock);
 
 	if ( request -> last_update >= last_job_update )
 	{
+		unlock_slurmctld (job_read_lock);
 		info ("slurm_rpc_job_step_get_info, no change, time=%ld",
 			(long) (clock () - start_time));
 		error_code = SLURM_NO_CHANGE_IN_DATA;
 	}
 	else {
-		error_code = pack_ctld_job_step_info_reponse_msg (&resp_buffer, &resp_buffer_size,
-				request->job_id, request->step_id);
+		error_code = pack_ctld_job_step_info_reponse_msg (&resp_buffer,
+				&resp_buffer_size,
+				request->job_id, request->step_id);
+		unlock_slurmctld (job_read_lock);
 		if (error_code == ESLURM_INVALID_JOB_ID)
 			info ("slurm_rpc_job_step_get_info, no such job step %u.%u, time=%ld",
 				request->job_id, request->step_id, (long) (clock () - start_time));
@@ -899,9 +954,14 @@ slurm_rpc_update_job ( slurm_msg_t * msg )
 	int error_code;
 	clock_t start_time;
 	job_desc_msg_t * job_desc_msg = ( job_desc_msg_t * ) msg-> data ;
+	/* Locks: Write job, read node, read partition */
+	slurmctld_lock_t job_write_lock = { NO_LOCK, WRITE_LOCK, READ_LOCK, READ_LOCK };
 
 	start_time = clock ();
-
+	debug ("Processing RPC: REQUEST_UPDATE_JOB");
+	lock_slurmctld (job_write_lock);
+
 	/* do RPC call */
 	error_code = update_job ( job_desc_msg );
+	unlock_slurmctld (job_write_lock);
 
@@ -919,6 +979,7 @@ slurm_rpc_update_job ( slurm_msg_t * msg )
 			job_desc_msg->job_id,
 			(long) (clock () - start_time));
 		slurm_send_rc_msg ( msg , SLURM_SUCCESS );
+		/* Below functions provide their own locking */
 		schedule ();
 		(void) dump_all_job_state ();
 	}
@@ -932,12 +993,18 @@ slurm_rpc_update_node ( slurm_msg_t * msg )
 	int error_code;
 	clock_t start_time;
 	update_node_msg_t * update_node_msg_ptr ;
+	/* Locks: Write node */
+	slurmctld_lock_t node_write_lock = { NO_LOCK, NO_LOCK, WRITE_LOCK, NO_LOCK };
+
 	start_time = clock ();
-
+	debug ("Processing RPC: REQUEST_UPDATE_NODE");
+	lock_slurmctld (node_write_lock);
+
 	update_node_msg_ptr = (update_node_msg_t * ) msg-> data ;
 
 	/* do RPC call */
 	error_code = update_node ( update_node_msg_ptr );
+	unlock_slurmctld (node_write_lock);
 
 	/* return result */
 	if (error_code)
@@ -954,6 +1021,8 @@ slurm_rpc_update_node ( slurm_msg_t * msg )
 			(long) (clock () - start_time));
 		slurm_send_rc_msg ( msg , SLURM_SUCCESS );
 	}
+
+	/* Below functions provide their own locks */
 	if (schedule ())
 		(void) dump_all_job_state ();
 	(void) dump_all_node_state ();
@@ -967,10 +1036,16 @@ slurm_rpc_update_partition ( slurm_msg_t * msg )
 	int error_code;
 	clock_t start_time;
 	update_part_msg_t * part_desc_ptr = (update_part_msg_t * ) msg-> data ;
+	/* Locks: Read node, write partition */
+	slurmctld_lock_t part_write_lock = { NO_LOCK, NO_LOCK, READ_LOCK, WRITE_LOCK };
+
 	start_time = clock ();
+	debug ("Processing RPC: REQUEST_UPDATE_PARTITION");
+	lock_slurmctld (part_write_lock);
 
 	/* do RPC call */
 	error_code = update_part ( part_desc_ptr );
+	unlock_slurmctld (part_write_lock);
 
 	/* return result */
 	if (error_code)
@@ -984,6 +1059,8 @@ slurm_rpc_update_partition ( slurm_msg_t * msg )
 		info ("slurm_rpc_update_partition complete for partition %s, time=%ld",
 			part_desc_ptr->name, (long) (clock () - start_time));
 		slurm_send_rc_msg ( msg , SLURM_SUCCESS );
+
+		/* NOTE: These functions provide their own locks */
 		(void) dump_all_part_state ();
 		if (schedule ())
 			(void) dump_all_job_state ();
@@ -1194,12 +1271,19 @@ slurm_rpc_reconfigure_controller ( slurm_msg_t * msg )
 	/* init */
 	int error_code;
 	clock_t start_time;
+	/* Locks: Write configuration, write job, write node, write partition */
+	slurmctld_lock_t config_write_lock = { WRITE_LOCK, WRITE_LOCK, WRITE_LOCK, WRITE_LOCK };
 
 	start_time = clock ();
+	debug ("Processing RPC: REQUEST_RECONFIGURE");
 	/* must be user root */
 
 	/* do RPC call */
+	lock_slurmctld (config_write_lock);
 	error_code = read_slurm_conf (0);
+	if (error_code == 0)
+		reset_job_bitmaps ();
+	unlock_slurmctld (config_write_lock);
 
 	/* return result */
 	if (error_code)
@@ -1210,7 +1294,6 @@ slurm_rpc_reconfigure_controller ( slurm_msg_t * msg )
 	}
 	else
 	{
-		reset_job_bitmaps ();
 		info ("slurm_rpc_reconfigure_controller completed successfully, time=%ld",
 			(long) (clock () - start_time));
 		slurm_send_rc_msg ( msg , SLURM_SUCCESS );
@@ -1257,16 +1340,21 @@ slurm_rpc_job_step_create( slurm_msg_t* msg )
 	job_step_create_response_msg_t job_step_resp;
 	job_step_create_request_msg_t * req_step_msg =
 			( job_step_create_request_msg_t* ) msg-> data ;
+	/* Locks: Write jobs, read nodes */
+	slurmctld_lock_t job_write_lock = { NO_LOCK, WRITE_LOCK, READ_LOCK, NO_LOCK };
 
 	start_time = clock ();
+	debug ("Processing RPC: REQUEST_JOB_STEP_CREATE");
 
 	/* issue the RPC */
 	dump_step_desc ( req_step_msg );
+	lock_slurmctld (job_write_lock);
 	error_code = step_create ( req_step_msg, &step_rec );
 
 	/* return result */
 	if ( error_code )
 	{
+		unlock_slurmctld (job_write_lock);
 		info ("slurm_rpc_job_step_create error %s, time=%ld",
 			slurm_strerror( error_code ), (long) (clock () - start_time));
 		slurm_send_rc_msg ( msg , error_code );
@@ -1285,12 +1373,13 @@ slurm_rpc_job_step_create( slurm_msg_t* msg )
 #ifdef HAVE_LIBELAN3
 		job_step_resp.qsw_job = step_rec-> qsw_job ;
 #endif
+		unlock_slurmctld (job_write_lock);
 		resp. address = msg -> address ;
 		resp. msg_type = RESPONSE_JOB_STEP_CREATE ;
 		resp. data = &job_step_resp ;
 
 		slurm_send_node_msg ( msg->conn_fd , &resp);
-		(void) dump_all_job_state ( );
+		(void) dump_all_job_state ( );	/* Sets own locks */
 	}
 }
@@ -1304,17 +1393,20 @@ slurm_rpc_node_registration ( slurm_msg_t * msg )
 	clock_t start_time;
 	slurm_node_registration_status_msg_t * node_reg_stat_msg =
 			( slurm_node_registration_status_msg_t * ) msg-> data ;
+	/* Locks: Write node */
+	slurmctld_lock_t node_write_lock = { NO_LOCK, NO_LOCK, WRITE_LOCK, NO_LOCK };
 
 	start_time = clock ();
+	debug ("Processing RPC: MESSAGE_NODE_REGISTRATION_STATUS");
+	lock_slurmctld (node_write_lock);
 
 	/* do RPC call */
-	/*cpus = real_memory = tmp_disk = NO_VAL;
-	 * this should be done client side now */
 	error_code = validate_node_specs (
 		node_reg_stat_msg -> node_name ,
 		node_reg_stat_msg -> cpus ,
 		node_reg_stat_msg -> real_memory_size ,
 		node_reg_stat_msg -> temporary_disk_space ) ;
+	unlock_slurmctld (node_write_lock);
 
 	/* return result */
 	if (error_code)
@@ -1412,10 +1504,6 @@ init_ctld_conf ( slurm_ctl_conf_t * conf_ptr )
 void
 fill_ctld_conf ( slurm_ctl_conf_t * conf_ptr )
 {
-	/* Locks: Read config */
-	slurmctld_lock_t config_read_lock = { READ_LOCK, NO_LOCK, NO_LOCK, NO_LOCK };
-
-	lock_slurmctld (config_read_lock);
 	conf_ptr->last_update = slurmctld_conf.last_update ;
 	conf_ptr->backup_controller = slurmctld_conf.backup_controller ;
 	conf_ptr->control_machine = slurmctld_conf.control_machine ;
@@ -1433,8 +1521,7 @@ fill_ctld_conf ( slurm_ctl_conf_t * conf_ptr )
 	conf_ptr->slurm_conf = slurmctld_conf.slurm_conf ;
 	conf_ptr->state_save_location = slurmctld_conf.state_save_location ;
 	conf_ptr->tmp_fs = slurmctld_conf.tmp_fs ;
-
-	unlock_slurmctld (config_read_lock);
+	return;
 }
 
 /* Variables for commandline passing using getopt */
diff --git a/src/slurmctld/job_mgr.c b/src/slurmctld/job_mgr.c
index 84e75a09884..1ca6bd375eb 100644
--- a/src/slurmctld/job_mgr.c
+++ b/src/slurmctld/job_mgr.c
@@ -903,7 +903,7 @@ init_job_conf ()
 		fatal ("init_job_conf: list_create can not allocate memory");
 	}
 
 	last_job_update = time (NULL);
-	return 0;
+	return SLURM_SUCCESS;
 }
@@ -1020,7 +1020,7 @@ job_allocate (job_desc_msg_t *job_specs, uint32_t *new_job_id, char **node_list
 		cpu_count_reps[0] = job_ptr->cpu_count_reps;
 	}
 	unlock_slurmctld (job_write_lock);
-	return 0;
+	return SLURM_SUCCESS;
 }
 
@@ -1035,33 +1035,25 @@ int
 job_cancel (uint32_t job_id)
 {
 	struct job_record *job_ptr;
-	/* Locks: Write job, write node */
-	slurmctld_lock_t job_write_lock = { NO_LOCK, WRITE_LOCK, WRITE_LOCK, NO_LOCK };
-
-	lock_slurmctld (job_write_lock);
 	job_ptr = find_job_record(job_id);
 	if (job_ptr == NULL) {
-		unlock_slurmctld (job_write_lock);
 		info ("job_cancel: invalid job id %u", job_id);
 		return ESLURM_INVALID_JOB_ID;
 	}
 
 	if ((job_ptr->job_state == JOB_FAILED) ||
 	    (job_ptr->job_state == JOB_COMPLETE) ||
-	    (job_ptr->job_state == JOB_TIMEOUT)) {
-		unlock_slurmctld (job_write_lock);
+	    (job_ptr->job_state == JOB_TIMEOUT))
 		return ESLURM_ALREADY_DONE;
-	}
 
 	if (job_ptr->job_state == JOB_PENDING) {
 		last_job_update = time (NULL);
 		job_ptr->job_state = JOB_FAILED;
 		job_ptr->start_time = job_ptr->end_time = time(NULL);
 		delete_job_details(job_ptr);
-		unlock_slurmctld (job_write_lock);
 		verbose ("job_cancel of pending job %u successful", job_id);
-		return 0;
+		return SLURM_SUCCESS;
 	}
 
 	if ((job_ptr->job_state == JOB_STAGE_IN) ||
@@ -1072,14 +1064,12 @@ job_cancel (uint32_t job_id)
 		job_ptr->end_time = time(NULL);
 		deallocate_nodes (job_ptr);
 		delete_job_details(job_ptr);
-		unlock_slurmctld (job_write_lock);
 		verbose ("job_cancel of running job %u successful", job_id);
-		return 0;
+		return SLURM_SUCCESS;
 	}
 
 	verbose ("job_cancel: job %u can't be cancelled from state=%s",
 			job_id, job_state_string(job_ptr->job_state));
-	unlock_slurmctld (job_write_lock);
 	return ESLURM_TRANSITION_STATE_NO_UPDATE;
 }
@@ -1094,30 +1084,22 @@ int
 job_complete (uint32_t job_id)
 {
 	struct job_record *job_ptr;
-	/* Locks: Write job, write node */
-	slurmctld_lock_t job_write_lock = { NO_LOCK, WRITE_LOCK, WRITE_LOCK, NO_LOCK };
 
-	lock_slurmctld (job_write_lock);
 	job_ptr = find_job_record(job_id);
 	if (job_ptr == NULL) {
-		unlock_slurmctld (job_write_lock);
 		info ("job_complete: invalid job id %u", job_id);
 		return ESLURM_INVALID_JOB_ID;
 	}
 
 	if ((job_ptr->job_state == JOB_FAILED) ||
 	    (job_ptr->job_state == JOB_COMPLETE) ||
-	    (job_ptr->job_state == JOB_TIMEOUT)) {
-		unlock_slurmctld (job_write_lock);
+	    (job_ptr->job_state == JOB_TIMEOUT))
 		return ESLURM_ALREADY_DONE;
-	}
 
 	if ((job_ptr->job_state == JOB_STAGE_IN) ||
 	    (job_ptr->job_state == JOB_RUNNING) ||
 	    (job_ptr->job_state == JOB_STAGE_OUT)) {
-		unlock_slurmctld (job_write_lock);
 		deallocate_nodes (job_ptr);
-		lock_slurmctld(job_write_lock);
 		verbose ("job_complete for job id %u successful", job_id);
 	}
 	else {
@@ -1128,8 +1110,7 @@ job_complete (uint32_t job_id)
 	job_ptr->job_state = JOB_COMPLETE;
 	job_ptr->end_time = time(NULL);
 	delete_job_details(job_ptr);
-	unlock_slurmctld (job_write_lock);
-	return 0;
+	return SLURM_SUCCESS;
 }
@@ -1355,7 +1336,7 @@ mkdir2 (char * path, int modes)
 		(void) chmod (path, modes);
 	}
 
-	return 0;
+	return SLURM_SUCCESS;
 }
 
 /* rmdir2 - Remove a directory, does system call if root, runs rmdir otherwise */
@@ -1379,7 +1360,7 @@ rmdir2 (char * path)
 			return error_code;
 	}
 
-	return 0;
+	return SLURM_SUCCESS;
 }
 
 /* Create file with specified name and write the supplied data array to it */
@@ -1390,7 +1371,7 @@ write_data_array_to_file ( char * file_name, char ** data, uint16_t size )
 
 	if (data == NULL) {
 		(void) unlink (file_name);
-		return 0;
+		return SLURM_SUCCESS;
 	}
 
 	fd = creat (file_name, 0600);
@@ -1413,7 +1394,7 @@ write_data_array_to_file ( char * file_name, char ** data, uint16_t size )
 	}
 
 	close (fd);
-	return 0;
+	return SLURM_SUCCESS;
 }
 
 /* Create file with specified name and write the supplied data to it */
@@ -1424,7 +1405,7 @@ write_data_to_file ( char * file_name, char * data )
 
 	if (data == NULL) {
 		(void) unlink (file_name);
-		return 0;
+		return SLURM_SUCCESS;
 	}
 
 	fd = creat (file_name, 0600);
@@ -1444,7 +1425,7 @@ write_data_to_file ( char * file_name, char * data )
 		nwrite -= pos;
 	}
 	close (fd);
-	return 0;
+	return SLURM_SUCCESS;
 }
 
 /* copy_job_desc_to_job_record - copy the job descriptor from the RPC structure
@@ -1525,7 +1506,7 @@ copy_job_desc_to_job_record ( job_desc_msg_t * job_desc ,
 	}
 
 	*job_rec_ptr = job_ptr;
-	return 0;
+	return SLURM_SUCCESS;
 }
 
 /*
@@ -1540,41 +1521,34 @@ job_step_cancel (uint32_t job_id, uint32_t step_id)
 {
 	struct job_record *job_ptr;
 	int error_code;
-	/* Locks: Write job */
-	slurmctld_lock_t job_write_lock = { NO_LOCK, WRITE_LOCK, NO_LOCK, NO_LOCK };
 
-	lock_slurmctld (job_write_lock);
 	job_ptr = find_job_record(job_id);
 	if (job_ptr == NULL) {
-		unlock_slurmctld (job_write_lock);
+		info ("job_step_cancel: invalid job id %u", job_id);
 		return ESLURM_INVALID_JOB_ID;
 	}
 
 	if ((job_ptr->job_state == JOB_FAILED) ||
 	    (job_ptr->job_state == JOB_COMPLETE) ||
-	    (job_ptr->job_state == JOB_TIMEOUT)) {
-		unlock_slurmctld (job_write_lock);
+	    (job_ptr->job_state == JOB_TIMEOUT))
 		return ESLURM_ALREADY_DONE;
-	}
 
 	if ((job_ptr->job_state == JOB_STAGE_IN) ||
 	    (job_ptr->job_state == JOB_RUNNING) ||
 	    (job_ptr->job_state == JOB_STAGE_OUT)) {
 		last_job_update = time (NULL);
 		error_code = delete_step_record (job_ptr, step_id);
-		unlock_slurmctld (job_write_lock);
 		if (error_code == ENOENT) {
 			info ("job_step_cancel step %u.%u not found", job_id, step_id);
 			return ESLURM_ALREADY_DONE;
 		}
-		return 0;
+		return SLURM_SUCCESS;
 	}
 
 	info ("job_step_cancel: step %u.%u can't be cancelled from state=%s",
 			job_id, step_id, job_state_string(job_ptr->job_state));
-	unlock_slurmctld (job_write_lock);
 	return ESLURM_TRANSITION_STATE_NO_UPDATE;
 }
@@ -1591,36 +1565,25 @@ job_step_complete (uint32_t job_id, uint32_t step_id)
 {
 	struct job_record *job_ptr;
 	int error_code;
-	/* Locks: Write job */
-	slurmctld_lock_t job_write_lock = { NO_LOCK, WRITE_LOCK, NO_LOCK, NO_LOCK };
 
-	lock_slurmctld (job_write_lock);
 	job_ptr = find_job_record(job_id);
 	if (job_ptr == NULL) {
-		unlock_slurmctld (job_write_lock);
 		info ("job_step_complete: invalid job id %u", job_id);
 		return ESLURM_INVALID_JOB_ID;
 	}
 
 	if ((job_ptr->job_state == JOB_FAILED) ||
 	    (job_ptr->job_state == JOB_COMPLETE) ||
-	    (job_ptr->job_state == JOB_TIMEOUT)) {
-		unlock_slurmctld (job_write_lock);
+	    (job_ptr->job_state == JOB_TIMEOUT))
 		return ESLURM_ALREADY_DONE;
-	}
 
 	last_job_update = time (NULL);
 	error_code = delete_step_record (job_ptr, step_id);
-	unlock_slurmctld (job_write_lock);
 	if (error_code == ENOENT) {
 		info ("job_step_complete step %u.%u not found", job_id, step_id);
 		return ESLURM_ALREADY_DONE;
 	}
-	return 0;
-
-	unlock_slurmctld (job_write_lock);
-	return ESLURM_TRANSITION_STATE_NO_UPDATE;
-
+	return SLURM_SUCCESS;
 }
@@ -1824,16 +1787,12 @@ pack_all_jobs (char **buffer_ptr, int *buffer_size, time_t * update_time)
 	char *buffer;
 	void *buf_ptr;
 	uint32_t jobs_packed ;
-	/* Locks: Read job */
-	slurmctld_lock_t job_read_lock = { NO_LOCK, READ_LOCK, NO_LOCK, NO_LOCK };
-
 	buffer_ptr[0] = NULL;
 	*buffer_size = 0;
 	if (*update_time == last_job_update)
 		return;
 
-	lock_slurmctld (job_read_lock);
 	buffer_allocated = (BUF_SIZE*16);
 	buffer = xmalloc(buffer_allocated);
 	buf_ptr = buffer;
@@ -1865,7 +1824,6 @@ pack_all_jobs (char **buffer_ptr, int *buffer_size, time_t * update_time)
 		jobs_packed ++ ;
 	}
 
-	unlock_slurmctld (job_read_lock);
 	list_iterator_destroy (job_record_iterator);
 	buffer_offset = (char *)buf_ptr - buffer;
 	xrealloc (buffer, buffer_offset);
@@ -2128,14 +2086,9 @@ update_job (job_desc_msg_t * job_specs)
 	struct job_details *detail_ptr;
 	struct part_record *tmp_part_ptr;
 	bitstr_t *req_bitmap = NULL ;
-	/* Locks: Write job, read node, read partition */
-	slurmctld_lock_t job_write_lock = { NO_LOCK, WRITE_LOCK, READ_LOCK, READ_LOCK };
-
-	lock_slurmctld (job_write_lock);
 	job_ptr = find_job_record (job_specs -> job_id);
 	if (job_ptr == NULL) {
-		unlock_slurmctld (job_write_lock);
 		error ("update_job: job_id %u does not exist.", job_specs -> job_id);
 		return ESLURM_INVALID_JOB_ID;
 	}
@@ -2215,10 +2168,8 @@ update_job (job_desc_msg_t * job_specs)
 
 	if (job_specs -> partition) {
 		tmp_part_ptr = find_part_record (job_specs -> partition);
-		if (tmp_part_ptr == NULL) {
-			unlock_slurmctld (job_write_lock);
+		if (tmp_part_ptr == NULL)
 			return ESLURM_INVALID_PARTITION_NAME;
-		}
 		strncpy(job_ptr -> partition, job_specs -> partition, MAX_NAME_LEN);
 		job_ptr -> part_ptr = tmp_part_ptr;
 		info ("update_job: setting partition to %s for job_id %u",
@@ -2229,7 +2180,6 @@ update_job (job_desc_msg_t * job_specs)
 	if (job_specs -> req_nodes && detail_ptr) {
 		error_code = node_name2bitmap (job_specs->req_nodes, &req_bitmap);
 		if (error_code == EINVAL) {
-			unlock_slurmctld (job_write_lock);
 			if ( req_bitmap )
 				bit_free (req_bitmap);
 			return ESLURM_INVALID_NODE_NAME;
@@ -2246,6 +2196,5 @@ update_job (job_desc_msg_t * job_specs)
 		job_specs -> req_nodes = NULL;
 	}
 
-	unlock_slurmctld (job_write_lock);
 	return SLURM_PROTOCOL_SUCCESS;
 }
diff --git a/src/slurmctld/node_mgr.c b/src/slurmctld/node_mgr.c
index 367d7c2a7af..78ff1ed3196 100644
--- a/src/slurmctld/node_mgr.c
+++ b/src/slurmctld/node_mgr.c
@@ -467,7 +467,7 @@ delete_config_record ()
 	last_node_update = time (NULL);
 	(void) list_delete_all (config_list, &list_find_config,
 			"universal_key");
-	return 0;
+	return SLURM_SUCCESS;
 }
 
@@ -476,7 +476,7 @@ delete_config_record ()
  * to avoid invalidating the bitmaps and hash table, we just clear the name
  * set its state to NODE_STATE_DOWN
  * input: name - name of the desired node
- * output: return 0 on success, errno otherwise
+ * output: return SLURM_SUCCESS on success, errno otherwise
  * global: node_record_table_ptr - pointer to global node table
  */
 int
@@ -499,7 +499,7 @@ delete_node_record (char *name)
 	strcpy (node_record_point->name, "");
 	node_record_point->node_state = NODE_STATE_DOWN;
 	last_bitmap_update = time (NULL);
-	return 0;
+	return SLURM_SUCCESS;
 }
 
@@ -549,7 +549,7 @@ dump_all_node_state ( void )
 	for (inx = 0; inx < node_record_count; inx++) {
 		if ((node_record_table_ptr[inx].magic != NODE_MAGIC) ||
 		    (node_record_table_ptr[inx].config_ptr->magic != CONFIG_MAGIC))
-			fatal ("pack_all_node: data integrity is bad");
+			fatal ("dump_all_node_state: data integrity is bad");
 
 		dump_node_state(&node_record_table_ptr[inx], &buf_ptr, &buf_len);
 		if (buf_len > BUF_SIZE)
@@ -845,7 +845,7 @@ init_node_conf ()
 
 	if (config_list == NULL)
 		fatal ("init_node_conf: list_create can not allocate memory");
 
-	return 0;
+	return SLURM_SUCCESS;
 }
@@ -940,7 +940,7 @@ node_name2bitmap (char *node_names, bitstr_t **bitmap)
 
 	hostlist_destroy (host_list);
 	bitmap[0] = my_bitmap;
-	return 0;
+	return SLURM_SUCCESS;
 }
 
@@ -966,15 +966,12 @@ pack_all_node (char **buffer_ptr, int *buffer_size, time_t * update_time)
 	char *buffer;
 	void *buf_ptr;
 	int nodes_packed;
-	/* Locks: Read node */
-	slurmctld_lock_t node_read_lock = { NO_LOCK, NO_LOCK, READ_LOCK, NO_LOCK };
 
 	buffer_ptr[0] = NULL;
 	*buffer_size = 0;
 	if (*update_time == last_node_update)
 		return;
 
-	lock_slurmctld (node_read_lock);
 	buffer_allocated = (BUF_SIZE*16);
 	buffer = xmalloc(buffer_allocated);
 	buf_ptr = buffer;
@@ -1002,7 +999,6 @@ pack_all_node (char **buffer_ptr, int *buffer_size, time_t * update_time)
 		buf_ptr = buffer + buffer_offset;
 	}
 
-	unlock_slurmctld (node_read_lock);
 	buffer_offset = (char *)buf_ptr - buffer;
 	xrealloc (buffer, buffer_offset);
@@ -1152,9 +1148,6 @@ update_node ( update_node_msg_t * update_node_msg )
 	char *this_node_name ;
 	struct node_record *node_record_point;
 	hostlist_t host_list;
-	/* Locks: Write node */
-	slurmctld_lock_t node_write_lock = { NO_LOCK, NO_LOCK, WRITE_LOCK, NO_LOCK };
-
 	if (update_node_msg -> node_names == NULL ) {
 		error ("update_node: invalid node name %s\n", update_node_msg -> node_names );
 		return ESLURM_INVALID_NODE_NAME;
@@ -1168,7 +1161,6 @@ update_node ( update_node_msg_t * update_node_msg )
 		return ESLURM_INVALID_NODE_NAME;
 	}
 
-	lock_slurmctld (node_write_lock);
 	last_node_update = time (NULL);
 	while ( (this_node_name = hostlist_shift (host_list)) ) {
 		node_record_point = find_node_record (this_node_name);
@@ -1201,7 +1193,6 @@ update_node ( update_node_msg_t * update_node_msg )
 		free (this_node_name);
 	}
 
-	unlock_slurmctld (node_write_lock);
 	hostlist_destroy (host_list);
 	return error_code;
 }
@@ -1223,15 +1214,10 @@ validate_node_specs (char *node_name, uint32_t cpus,
 	int error_code;
 	struct config_record *config_ptr;
 	struct node_record *node_ptr;
-	/* Locks: Write node */
-	slurmctld_lock_t node_write_lock = { NO_LOCK, NO_LOCK, WRITE_LOCK, NO_LOCK };
 
-	lock_slurmctld (node_write_lock);
 	node_ptr = find_node_record (node_name);
-	if (node_ptr == NULL) {
-		unlock_slurmctld (node_write_lock);
+	if (node_ptr == NULL)
 		return ENOENT;
-	}
 
 	node_ptr->last_response = last_node_update = time (NULL);
 	config_ptr = node_ptr->config_ptr;
@@ -1275,6 +1261,5 @@ validate_node_specs (char *node_name, uint32_t cpus,
 		bit_set (up_node_bitmap, (node_ptr - node_record_table_ptr));
 	}
 
-	unlock_slurmctld (node_write_lock);
 	return error_code;
 }
diff --git a/src/slurmctld/node_scheduler.c b/src/slurmctld/node_scheduler.c
index 247e3b0fd4c..53e95a4da13 100644
--- a/src/slurmctld/node_scheduler.c
+++ b/src/slurmctld/node_scheduler.c
@@ -620,7 +620,7 @@ pick_best_nodes (struct node_set *node_set_ptr, int node_set_size,
 			if ((shared != 1) &&
 			    (bit_super_set (req_bitmap[0], idle_node_bitmap) != 1))
 				return EAGAIN;
-			return 0;	/* user can have selected nodes, we're done! */
+			return SLURM_SUCCESS;	/* user can have selected nodes, we're done! */
 		}
 		total_nodes = total_cpus = 0;	/* reinitialize */
 	}
@@ -688,7 +688,7 @@ pick_best_nodes (struct node_set *node_set_ptr, int node_set_size,
 			if (req_bitmap[0])
 				bit_free (req_bitmap[0]);
 			req_bitmap[0] = avail_bitmap;
-			return 0;
+			return SLURM_SUCCESS;
 		}
 	}
diff --git a/src/slurmctld/partition_mgr.c b/src/slurmctld/partition_mgr.c
index 9792902ddb7..7602e2e3248 100644
--- a/src/slurmctld/partition_mgr.c
+++ b/src/slurmctld/partition_mgr.c
@@ -348,7 +348,7 @@ dump_all_part_state ( void )
 	part_record_iterator = list_iterator_create (part_list);
 	while ((part_record_point = (struct part_record *) list_next (part_record_iterator))) {
 		if (part_record_point->magic != PART_MAGIC)
-			fatal ("pack_all_part: data integrity is bad");
+			fatal ("dump_all_part_state: data integrity is bad");
 
 		dump_part_state (part_record_point, &buf_ptr, &buf_len);
 		if (buf_len > BUF_SIZE)
@@ -652,15 +652,12 @@ pack_all_part (char **buffer_ptr, int *buffer_size, time_t * update_time)
 	char *buffer;
 	void *buf_ptr;
 	int parts_packed;
-	/* Locks: Read partition */
-	slurmctld_lock_t part_read_lock = { NO_LOCK, NO_LOCK, NO_LOCK, READ_LOCK };
 
 	buffer_ptr[0] = NULL;
 	*buffer_size = 0;
 	if (*update_time == last_part_update)
 		return;
 
-	lock_slurmctld (part_read_lock);
 	buffer_allocated = (BUF_SIZE*16);
 	buffer = xmalloc(buffer_allocated);
 	buf_ptr = buffer;
@@ -691,7 +688,6 @@ pack_all_part (char **buffer_ptr, int *buffer_size, time_t * update_time)
 	}
 
 	list_iterator_destroy (part_record_iterator);
-	unlock_slurmctld (part_read_lock);
 	buffer_offset = (char *)buf_ptr - buffer;
 	xrealloc (buffer, buffer_offset);
@@ -761,8 +757,6 @@ update_part (update_part_msg_t * part_desc )
 {
 	int error_code, i;
 	struct part_record *part_ptr;
-	/* Locks: Read node, write partition */
-	slurmctld_lock_t part_write_lock = { NO_LOCK, NO_LOCK, READ_LOCK, WRITE_LOCK };
 
 	if ((part_desc -> name == NULL ) ||
 	    (strlen (part_desc->name ) >= MAX_NAME_LEN)) {
@@ -771,7 +765,6 @@ update_part (update_part_msg_t * part_desc )
 	}
 
 	error_code = 0;
-	lock_slurmctld (part_write_lock);
 	part_ptr = list_find_first (part_list, &list_find_part, part_desc->name);
 
 	if (part_ptr == NULL) {
@@ -851,6 +844,5 @@ update_part (update_part_msg_t * part_desc )
 		}
 	}
 
-	unlock_slurmctld (part_write_lock);
 	return error_code;
 }
diff --git a/src/slurmctld/read_config.c b/src/slurmctld/read_config.c
index 6da6367c118..4fad7b42acd 100644
--- a/src/slurmctld/read_config.c
+++ b/src/slurmctld/read_config.c
@@ -844,21 +844,17 @@ read_slurm_conf (int recover) {
 	int line_num;		/* line number in input file */
 	char in_line[BUF_SIZE];	/* input line */
 	int i, j, error_code;
-	/* Locks: Write configuration, write job, write node, write partition */
-	slurmctld_lock_t config_write_lock = { WRITE_LOCK, WRITE_LOCK, WRITE_LOCK, WRITE_LOCK };
 	int old_node_record_count;
 	struct node_record *old_node_table_ptr;
 	struct node_record *node_record_point;
 
 	/* initialization */
-	lock_slurmctld (config_write_lock);
 	start_time = clock ();
 	old_node_record_count = node_record_count;
 	old_node_table_ptr = node_record_table_ptr;	/* save node states for reconfig RPC */
 	node_record_table_ptr = NULL;
 	if ( (error_code = init_slurm_conf ()) ) {
 		node_record_table_ptr = old_node_table_ptr;
-		unlock_slurmctld (config_write_lock);
 		return error_code;
 	}
@@ -879,7 +875,6 @@ read_slurm_conf (int recover) {
 			if (old_node_table_ptr)
 				xfree (old_node_table_ptr);
 			fclose (slurm_spec_file);
-			unlock_slurmctld (config_write_lock);
 			return E2BIG;
 			break;
 		}
@@ -908,7 +903,6 @@ read_slurm_conf (int recover) {
 			fclose (slurm_spec_file);
 			if (old_node_table_ptr)
 				xfree (old_node_table_ptr);
-			unlock_slurmctld (config_write_lock);
 			return error_code;
 		}
 
@@ -917,7 +911,6 @@ read_slurm_conf (int recover) {
 			fclose (slurm_spec_file);
 			if (old_node_table_ptr)
 				xfree (old_node_table_ptr);
-			unlock_slurmctld (config_write_lock);
 			return error_code;
 		}
 
@@ -926,7 +919,6 @@ read_slurm_conf (int recover) {
 			fclose (slurm_spec_file);
 			if (old_node_table_ptr)
 				xfree (old_node_table_ptr);
-			unlock_slurmctld (config_write_lock);
 			return error_code;
 		}
 
@@ -941,7 +933,6 @@ read_slurm_conf (int recover) {
 
 	if (slurmctld_conf.control_machine == NULL) {
 		fatal ("read_slurm_conf: control_machine value not specified.");
-		unlock_slurmctld (config_write_lock);
 		return EINVAL;
 	}
@@ -949,7 +940,6 @@ read_slurm_conf (int recover) {
 		error ("read_slurm_conf: default partition not set.");
 		if (old_node_table_ptr)
 			xfree (old_node_table_ptr);
-		unlock_slurmctld (config_write_lock);
 		return EINVAL;
 	}
@@ -957,7 +947,6 @@ read_slurm_conf (int recover) {
 		error ("read_slurm_conf: no nodes configured.");
 		if (old_node_table_ptr)
 			xfree (old_node_table_ptr);
-		unlock_slurmctld (config_write_lock);
 		return EINVAL;
 	}
@@ -979,10 +968,8 @@ read_slurm_conf (int recover) {
 		(void) load_job_state ();
 	}
 
-	if ((error_code = build_bitmaps ())) {
-		unlock_slurmctld (config_write_lock);
+	if ((error_code = build_bitmaps ()))
 		return error_code;
-	}
 
 	if (recover) {
 		(void) sync_nodes_to_jobs ();
@@ -994,7 +981,6 @@ read_slurm_conf (int recover) {
 
 	info ("read_slurm_conf: finished loading configuration, time=%ld",
 		(long) (clock () - start_time));
-	unlock_slurmctld (config_write_lock);
 
 	return SLURM_SUCCESS;
 }
diff --git a/src/slurmctld/step_mgr.c b/src/slurmctld/step_mgr.c
index 588f5b4704d..86acc14ed03 100644
--- a/src/slurmctld/step_mgr.c
+++ b/src/slurmctld/step_mgr.c
@@ -299,41 +299,27 @@ step_create ( step_specs *step_specs, struct step_record** new_step_record )
 	int nprocs = step_specs->cpu_count;
 	int node_set_size = QSW_MAX_TASKS;	/* overkill but safe */
 #endif
-	/* Locks: Write jobs, read nodes */
-	slurmctld_lock_t job_write_lock = { NO_LOCK, WRITE_LOCK, READ_LOCK, NO_LOCK };
 
 	*new_step_record = NULL;
-	lock_slurmctld (job_write_lock);
 	job_ptr = find_job_record (step_specs->job_id);
-	if (job_ptr == NULL) {
-		unlock_slurmctld (job_write_lock);
+	if (job_ptr == NULL)
 		return ESLURM_INVALID_JOB_ID ;
-	}
 
 	if (step_specs->user_id != job_ptr->user_id &&
-	    step_specs->user_id != 0) {
-		unlock_slurmctld (job_write_lock);
+	    step_specs->user_id != 0)
 		return ESLURM_ACCESS_DENIED ;
-	}
 
 	if ((job_ptr->job_state == JOB_COMPLETE) ||
 	    (job_ptr->job_state == JOB_FAILED) ||
 	    (job_ptr->job_state == JOB_TIMEOUT) ||
-	    (job_ptr->job_state == JOB_STAGE_OUT)) {
-		unlock_slurmctld (job_write_lock);
+	    (job_ptr->job_state == JOB_STAGE_OUT))
 		return ESLURM_ALREADY_DONE;
-	}
 
 	nodeset = pick_step_nodes (job_ptr, step_specs );
-	if (nodeset == NULL) {
-		unlock_slurmctld (job_write_lock);
+	if (nodeset == NULL)
 		return ESLURM_REQUESTED_NODE_CONFIG_UNAVAILABLE ;
-	}
 
-	/* FIXME need to set the error codes and define them
-	 * probably shouldn't exit w/ a fatal...
-	 */
 	step_ptr = create_step_record (job_ptr);
 	if (step_ptr == NULL)
 		fatal ("create_step_record failed with no memory");
@@ -366,7 +352,6 @@ step_create ( step_specs *step_specs, struct step_record** new_step_record )
 #endif
 
 	*new_step_record = step_ptr;
-	unlock_slurmctld (job_write_lock);
 	return SLURM_SUCCESS;
 }
-- 
GitLab
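
Note: the locking idiom this patch standardizes across the slurmctld RPC
handlers can be sketched in isolation as follows. This is a minimal
illustrative sketch, not code from the patch itself: the handler name
example_rpc_handler and its body are hypothetical, while slurmctld_lock_t,
lock_slurmctld (), unlock_slurmctld (), slurm_send_rc_msg (), and the
{ config, job, node, partition } initializer order are taken from the hunks
above.

/* Hypothetical RPC handler showing the pattern used throughout this patch:
 * declare the needed lock set up front, hold it only around the shared-data
 * access, and defer the reply and any self-locking helpers until after the
 * locks are released. */
void
example_rpc_handler ( slurm_msg_t * msg )
{
	/* Locks: Read job (initializer order: config, job, node, partition) */
	slurmctld_lock_t job_read_lock = { NO_LOCK, READ_LOCK, NO_LOCK, NO_LOCK };

	lock_slurmctld (job_read_lock);
	/* ... read or copy the shared job data here ... */
	unlock_slurmctld (job_read_lock);

	/* Functions such as schedule () and dump_all_job_state () provide
	 * their own locking, so call them only after unlock_slurmctld (). */
	slurm_send_rc_msg ( msg , SLURM_SUCCESS );
}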