diff --git a/NEWS b/NEWS index 2a24967ead6290b4a43c73315c21ae6e6af0e02a..5c2bbce4218675af8b24e3289a04b6d5f66bd5ad 100644 --- a/NEWS +++ b/NEWS @@ -35,6 +35,12 @@ documents those changes that are of interest to users and admins. limits). -- Fix issue with license used count when doing an scontrol reconfig. -- Fix the PMI iterator to not report duplicated keys. + -- Fix issue with sinfo when -o is used without the %P option. + -- Rather than immediately invoking an execution of the scheduling logic on + every event type that can enable the execution of a new job, queue its + execution. This permits faster execution of some operations, such as + modifying large counts of jobs, by executing the scheduling logic less + frequently, but still in a timely fashion. * Changes in Slurm 14.03.0 ========================== diff --git a/src/api/pmi.c b/src/api/pmi.c index 84a1f4a05b4665b225285e9454b16399693c70bc..5aaeab75c8d369cd48168432cc8f191616ec1834 100644 --- a/src/api/pmi.c +++ b/src/api/pmi.c @@ -103,8 +103,9 @@ #define KVS_STATE_LOCAL 0 #define KVS_STATE_DEFUNCT 1 -#define KVS_KEY_STATE_GLOBAL 0 -#define KVS_KEY_STATE_LOCAL 1 +#define KVS_KEY_STATE_GLOBAL 0 +#define KVS_KEY_STATE_LOCAL 1 +#define KVS_KEY_STATE_DISABLED 2 /* default key names form is jobid.stepid[.taskid.sequence] */ struct kvs_rec { @@ -139,6 +140,7 @@ inline static void _kvs_dump(void); static int _kvs_put( const char kvsname[], const char key[], const char value[], int local); static void _kvs_swap(struct kvs_rec *kvs_ptr, int inx1, int inx2); +static void _kvs_disable_local_keys(void); /* Global variables */ long pmi_jobid; @@ -676,6 +678,9 @@ int PMI_Barrier( void ) if (pmi_debug) fprintf(stderr, "Past PMI_Barrier\n"); + + _kvs_disable_local_keys(); + for (i=0; i<kvs_set_ptr->kvs_comm_recs; i++) { kvs_ptr = kvs_set_ptr->kvs_comm_ptr[i]; for (j=0; j<kvs_ptr->kvs_cnt; j++) { @@ -1454,7 +1459,7 @@ int PMI_KVS_Iter_first(const char kvsname[], char key[], int key_len, char val[] for (j = 0; i < kvs_recs[i].kvs_cnt; j++) { - if (kvs_recs[i].kvs_key_states[j] == KVS_KEY_STATE_LOCAL) + if (kvs_recs[i].kvs_key_states[j] == KVS_KEY_STATE_DISABLED) continue; strncpy(key, kvs_recs[i].kvs_keys[j], key_len); @@ -1532,7 +1537,7 @@ int PMI_KVS_Iter_next(const char kvsname[], char key[], int key_len, } for (j = kvs_recs[i].kvs_inx; j < kvs_recs[i].kvs_cnt; j++) { - if (kvs_recs[i].kvs_key_states[j] == KVS_KEY_STATE_LOCAL) + if (kvs_recs[i].kvs_key_states[j] == KVS_KEY_STATE_DISABLED) continue; strncpy(key, kvs_recs[i].kvs_keys[j], key_len); @@ -1962,3 +1967,23 @@ inline static void _kvs_dump(void) } #endif } + +/* _kvs_disable_local_keys() + * + * The PMI_Barrier() call returns all [key,val] from all + * ranks including myself. As such there are duplicated + * keys with state LOCAL and GLOBAL; disable the LOCAL + * one so the iterator won't return duplicated keys.
+ */ +static void +_kvs_disable_local_keys(void) +{ + int i; + int j; + + for (i = 0; i < kvs_rec_cnt; i++) { + for (j = 0; j < kvs_recs[i].kvs_cnt; j++) + if (kvs_recs[i].kvs_key_states[j] == KVS_KEY_STATE_LOCAL) + kvs_recs[i].kvs_key_states[j] = KVS_KEY_STATE_DISABLED; + } +} diff --git a/src/sinfo/sinfo.c b/src/sinfo/sinfo.c index 71c6606355a9709d502b5807131e8923039e5bd8..b9863abeab0d7da364fdc793190d4c29df717b63 100644 --- a/src/sinfo/sinfo.c +++ b/src/sinfo/sinfo.c @@ -802,6 +802,9 @@ static bool _match_part_data(sinfo_data_t *sinfo_ptr, if ((part_ptr == NULL) || (sinfo_ptr->part_info == NULL)) return false; + if ((_strcmp(part_ptr->name, sinfo_ptr->part_info->name))) + return false; + if (params.match_flags.avail_flag && (part_ptr->state_up != sinfo_ptr->part_info->state_up)) return false; @@ -827,10 +830,6 @@ static bool _match_part_data(sinfo_data_t *sinfo_ptr, (part_ptr->max_time != sinfo_ptr->part_info->max_time)) return false; - if (params.match_flags.partition_flag && - (_strcmp(part_ptr->name, sinfo_ptr->part_info->name))) - return false; - if (params.match_flags.root_flag && ((part_ptr->flags & PART_FLAG_ROOT_ONLY) != (sinfo_ptr->part_info->flags & PART_FLAG_ROOT_ONLY))) diff --git a/src/slurmctld/controller.c b/src/slurmctld/controller.c index b6ec95ca2468954b6a2d28c2ece36d3958dce677..c3ca0f029a4d76d8c0433cde2e0a71f7f193e2f4 100644 --- a/src/slurmctld/controller.c +++ b/src/slurmctld/controller.c @@ -190,6 +190,8 @@ static pid_t slurmctld_pid; static char *slurm_conf_filename; static int primary = 1 ; static pthread_t assoc_cache_thread = (pthread_t) 0; +static pthread_mutex_t sched_cnt_mutex = PTHREAD_MUTEX_INITIALIZER; +static int job_sched_cnt = 0; /* * Static list of signals to block in this process @@ -779,12 +781,20 @@ static int _reconfigure_slurm(void) start_power_mgr(&slurmctld_config.thread_id_power); trigger_reconfig(); priority_g_reconfig(true); /* notify priority plugin too */ - schedule(0); /* has its own locks */ - save_all_state(); + save_all_state(); /* Has own locking */ + queue_job_scheduler(); return rc; } +/* Request that the job scheduler execute soon (typically within seconds) */ +extern void queue_job_scheduler(void) +{ + slurm_mutex_lock(&sched_cnt_mutex); + job_sched_cnt++; + slurm_mutex_unlock(&sched_cnt_mutex); +} + /* _slurmctld_signal_hand - Process daemon-wide signals */ static void *_slurmctld_signal_hand(void *no_data) { @@ -1318,10 +1328,10 @@ static void *_slurmctld_background(void *no_data) static time_t last_uid_update; static time_t last_reboot_msg_time; static bool ping_msg_sent = false; - static bool run_job_scheduler = false; time_t now; int no_resp_msg_interval, ping_interval, purge_job_interval; int group_time, group_force; + uint32_t job_limit; DEF_TIMERS; /* Locks: Read config */ @@ -1418,7 +1428,7 @@ static void *_slurmctld_background(void *no_data) last_resv_time = now; lock_slurmctld(node_write_lock); if (set_node_maint_mode(false) > 0) - run_job_scheduler = true; + queue_job_scheduler(); unlock_slurmctld(node_write_lock); } @@ -1540,12 +1550,23 @@ static void *_slurmctld_background(void *no_data) unlock_slurmctld(job_write_lock); } - if ((difftime(now, last_sched_time) >= PERIODIC_SCHEDULE) || - run_job_scheduler) { + job_limit = NO_VAL; + if (difftime(now, last_sched_time) >= PERIODIC_SCHEDULE) { + slurm_mutex_lock(&sched_cnt_mutex); + /* job_limit = job_sched_cnt; Ignored */ + job_limit = INFINITE; + job_sched_cnt = 0; + slurm_mutex_unlock(&sched_cnt_mutex); + } else if (job_sched_cnt) { + slurm_mutex_lock(&sched_cnt_mutex); 
+ job_limit = 0; /* Default depth */ + job_sched_cnt = 0; + slurm_mutex_unlock(&sched_cnt_mutex); + } + if (job_limit != NO_VAL) { now = time(NULL); last_sched_time = now; - run_job_scheduler = false; - if (schedule(INFINITE)) + if (schedule(job_limit)) last_checkpoint_time = 0; /* force state save */ set_job_elig_time(); } diff --git a/src/slurmctld/job_scheduler.c b/src/slurmctld/job_scheduler.c index 22121f06565fa7caed5cec165754f7beabd1e1f7..6ea3d86ccef53db3cf2b7c14209e4a34908d8af4 100644 --- a/src/slurmctld/job_scheduler.c +++ b/src/slurmctld/job_scheduler.c @@ -747,7 +747,6 @@ extern int schedule(uint32_t job_limit) static int max_jobs_per_part = 0; static int defer_rpc_cnt = 0; time_t now = time(NULL), sched_start; - DEF_TIMERS; sched_start = now; diff --git a/src/slurmctld/proc_req.c b/src/slurmctld/proc_req.c index 708496466d843e12ace102a9dde5cc94c626d3dd..4811ae7b12ad73e46dc590bc7930175ee4c19837 100644 --- a/src/slurmctld/proc_req.c +++ b/src/slurmctld/proc_req.c @@ -1597,9 +1597,9 @@ static void _slurm_rpc_epilog_complete(slurm_msg_t * msg) * of managed jobs. */ if (!defer_sched) - (void) schedule(0); - schedule_node_save(); - schedule_job_save(); + (void) schedule(0); /* Has own locking */ + schedule_node_save(); /* Has own locking */ + schedule_job_save(); /* Has own locking */ } /* NOTE: RPC has no response */ @@ -2316,12 +2316,7 @@ static void _slurm_rpc_node_registration(slurm_msg_t * msg) unlock_slurmctld(job_write_lock); END_TIMER2("_slurm_rpc_node_registration"); if (newly_up) { - static time_t last_schedule = (time_t) 0; - time_t now = time(NULL); - if (difftime(now, last_schedule) > 5) { - last_schedule = now; - schedule(0); /* has its own locks */ - } + queue_job_scheduler(); } } @@ -2661,8 +2656,8 @@ static void _slurm_rpc_reconfigure_controller(slurm_msg_t * msg) TIME_STR); slurm_send_rc_msg(msg, SLURM_SUCCESS); priority_g_reconfig(false); /* notify priority plugin too */ - schedule(0); /* has its own locks */ - save_all_state(); + save_all_state(); /* has its own locks */ + queue_job_scheduler(); } } @@ -2953,12 +2948,7 @@ static void _slurm_rpc_step_update(slurm_msg_t *msg) /* _slurm_rpc_submit_batch_job - process RPC to submit a batch job */ static void _slurm_rpc_submit_batch_job(slurm_msg_t * msg) { - static time_t config_update = 0; - static bool defer_sched = false; static int active_rpc_cnt = 0; - static pthread_mutex_t sched_cnt_mutex = PTHREAD_MUTEX_INITIALIZER; - static int sched_cnt = 0; - int sched_now_cnt = 0; int error_code = SLURM_SUCCESS; DEF_TIMERS; uint32_t step_id = 0; @@ -2970,18 +2960,11 @@ static void _slurm_rpc_submit_batch_job(slurm_msg_t * msg) slurmctld_lock_t job_write_lock = { NO_LOCK, WRITE_LOCK, READ_LOCK, READ_LOCK }; uid_t uid = g_slurm_auth_get_uid(msg->auth_cred, NULL); - int schedule_cnt = 1; char *err_msg = NULL; START_TIMER; debug2("Processing RPC: REQUEST_SUBMIT_BATCH_JOB from uid=%d", uid); - if (config_update != slurmctld_conf.last_update) { - char *sched_params = slurm_get_sched_params(); - defer_sched = (sched_params && strstr(sched_params,"defer")); - xfree(sched_params); - } - slurm_msg_t_init(&response_msg); response_msg.flags = msg->flags; response_msg.protocol_version = msg->protocol_version; @@ -2997,11 +2980,8 @@ static void _slurm_rpc_submit_batch_job(slurm_msg_t * msg) error_code = ESLURM_INVALID_NODE_NAME; error("REQUEST_SUBMIT_BATCH_JOB lacks alloc_node from uid=%d", uid); } - if (error_code == SLURM_SUCCESS) { + if (error_code == SLURM_SUCCESS) error_code = validate_job_create_req(job_desc_msg); - if 
(job_desc_msg->array_bitmap) - schedule_cnt = 100; - } dump_job_desc(job_desc_msg); if (error_code == SLURM_SUCCESS) { _throttle_start(&active_rpc_cnt); @@ -3116,8 +3096,6 @@ static void _slurm_rpc_submit_batch_job(slurm_msg_t * msg) } else if (!job_ptr) { /* Mostly to avoid CLANG error */ fatal("job_allocate failed to allocate job, rc=%d",error_code); } else { - if (job_ptr->part_ptr_list) - schedule_cnt *= list_count(job_ptr->part_ptr_list); info("_slurm_rpc_submit_batch_job JobId=%u %s", job_ptr->job_id, TIME_STR); /* send job_ID */ @@ -3128,32 +3106,12 @@ static void _slurm_rpc_submit_batch_job(slurm_msg_t * msg) response_msg.data = &submit_msg; slurm_send_node_msg(msg->conn_fd, &response_msg); - /* In defer mode, avoid triggering the scheduler logic - * for every submit batch job request. */ - if (!defer_sched) { - slurm_mutex_lock(&sched_cnt_mutex); - sched_cnt += schedule_cnt; - slurm_mutex_unlock(&sched_cnt_mutex); - } + schedule_job_save(); /* Has own locks */ + schedule_node_save(); /* Has own locks */ + queue_job_scheduler(); } -fini: /* We need to use schedule() to initiate a batch job in order to run - * the various prologs, boot the node, etc. We also run schedule() - * even if this job could not start, say due to a higher priority job, - * since the locks are released above and we might start some other - * job here. We do not run schedule() on each batch submission to - * limit its overhead on large numbers of job submissions */ - slurm_mutex_lock(&sched_cnt_mutex); - if ((active_rpc_cnt == 0) || (sched_cnt > 32)) { - sched_now_cnt = sched_cnt; - sched_cnt = 0; - } - slurm_mutex_unlock(&sched_cnt_mutex); - if (sched_now_cnt) - (void) schedule(sched_now_cnt); /* has own locks */ - schedule_job_save(); /* has own locks */ - schedule_node_save(); /* has own locks */ - xfree(err_msg); +fini: xfree(err_msg); } /* _slurm_rpc_update_job - process RPC to update the configuration of a @@ -3188,9 +3146,9 @@ static void _slurm_rpc_update_job(slurm_msg_t * msg) job_desc_msg->job_id, uid, TIME_STR); slurm_send_rc_msg(msg, SLURM_SUCCESS); /* Below functions provide their own locking */ - schedule(0); schedule_job_save(); schedule_node_save(); + queue_job_scheduler(); } } @@ -3334,9 +3292,8 @@ static void _slurm_rpc_update_node(slurm_msg_t * msg) } /* Below functions provide their own locks */ - if (schedule(0)) - schedule_job_save(); schedule_node_save(); + queue_job_scheduler(); trigger_reconfig(); } @@ -3385,12 +3342,8 @@ static void _slurm_rpc_update_partition(slurm_msg_t * msg) part_desc_ptr->name, TIME_STR); slurm_send_rc_msg(msg, SLURM_SUCCESS); - /* NOTE: These functions provide their own locks */ - schedule_part_save(); - if (schedule(0)) { - schedule_job_save(); - schedule_node_save(); - } + schedule_part_save(); /* Has its locking */ + queue_job_scheduler(); } } @@ -3432,10 +3385,8 @@ static void _slurm_rpc_delete_partition(slurm_msg_t * msg) part_desc_ptr->name, TIME_STR); slurm_send_rc_msg(msg, SLURM_SUCCESS); - /* NOTE: These functions provide their own locks */ - schedule(0); - save_all_state(); - + save_all_state(); /* Has own locking */ + queue_job_scheduler(); } } @@ -3492,11 +3443,7 @@ static void _slurm_rpc_resv_create(slurm_msg_t * msg) response_msg.data = &resv_resp_msg; slurm_send_node_msg(msg->conn_fd, &response_msg); - /* NOTE: These functions provide their own locks */ - if (schedule(0)) { - schedule_job_save(); - schedule_node_save(); - } + queue_job_scheduler(); } } @@ -3538,11 +3485,7 @@ static void _slurm_rpc_resv_update(slurm_msg_t * msg) 
resv_desc_ptr->name, TIME_STR); slurm_send_rc_msg(msg, SLURM_SUCCESS); - /* NOTE: These functions provide their own locks */ - if (schedule(0)) { - schedule_job_save(); - schedule_node_save(); - } + queue_job_scheduler(); } } @@ -3585,12 +3528,7 @@ static void _slurm_rpc_resv_delete(slurm_msg_t * msg) resv_desc_ptr->name, TIME_STR); slurm_send_rc_msg(msg, SLURM_SUCCESS); - /* NOTE: These functions provide their own locks */ - if (schedule(0)) { - schedule_job_save(); - schedule_node_save(); - } - + queue_job_scheduler(); } } @@ -3844,10 +3782,10 @@ inline static void _slurm_rpc_suspend(slurm_msg_t * msg) } else { info("_slurm_rpc_suspend(%s) for %u %s", op, sus_ptr->job_id, TIME_STR); - /* Functions below provide their own locking */ + + schedule_job_save(); /* Has own locking */ if (sus_ptr->op == SUSPEND_JOB) - (void) schedule(0); - schedule_job_save(); + queue_job_scheduler(); } } diff --git a/src/slurmctld/slurmctld.h b/src/slurmctld/slurmctld.h index cc4ad6e859f98a047b9bd1c4d4365cc7d04eba2a..5eebc2fcf81d9627d35fb96502c73e718705443c 100644 --- a/src/slurmctld/slurmctld.h +++ b/src/slurmctld/slurmctld.h @@ -1653,6 +1653,9 @@ void purge_old_job(void); /* Convert a comma delimited list of QOS names into a bitmap */ extern void qos_list_build(char *qos, bitstr_t **qos_bits); +/* Request that the job scheduler execute soon (typically within seconds) */ +extern void queue_job_scheduler(void); + /* * rehash_jobs - Create or rebuild the job hash table. * NOTE: run lock_slurmctld before entry: Read config, write job diff --git a/testsuite/expect/inc3.11.7 b/testsuite/expect/inc3.11.7 index 54e76316e143554f59422466ba508f34bafd6e02..4f4b7ce23ceea4c6df98c9c3fda3bdd51de3c266 100644 --- a/testsuite/expect/inc3.11.7 +++ b/testsuite/expect/inc3.11.7 @@ -91,6 +91,7 @@ proc inc3_11_7 {} { exit 1 } + sleep 10 # Show the job, make sure reservation tag is right spawn $scontrol show job $job_id expect { @@ -140,7 +141,7 @@ proc inc3_11_7 {} { exit 1 } - sleep 1 + sleep 10 # Show the job, make sure reservation tag is right spawn $scontrol show job $job_id expect { @@ -190,7 +191,7 @@ proc inc3_11_7 {} { exit 1 } - sleep 1 + sleep 10 # Show the job, make sure reservation tag is right spawn $scontrol show job $job_id expect { @@ -239,7 +240,7 @@ proc inc3_11_7 {} { exit 1 } - sleep 1 + sleep 10 # Show the job, make sure reservation tag is right spawn $scontrol show job $job_id expect { @@ -290,7 +291,7 @@ proc inc3_11_7 {} { exit 1 } - sleep 1 + sleep 10 # Show the job, make sure reservation tag is right spawn $scontrol show job $job_id expect { @@ -325,7 +326,7 @@ proc inc3_11_7 {} { exit $ret_code } - sleep 3 + sleep 10 # Show the job spawn $scontrol show job $job_id expect { diff --git a/testsuite/expect/inc3.11.9 b/testsuite/expect/inc3.11.9 index b6b56539538769392651575bb8152777ed306c3f..a8c363a6d9ed324dbb1c3b041762bf9f70d38710 100644 --- a/testsuite/expect/inc3.11.9 +++ b/testsuite/expect/inc3.11.9 @@ -227,7 +227,7 @@ proc inc3_11_9 {} { exit 1 } - sleep 1 + sleep 10 # Show the job, make sure reservation tag is right spawn $scontrol show job $job_id expect { @@ -285,7 +285,7 @@ proc inc3_11_9 {} { exit 1 } - sleep 1 + sleep 10 # Show the job, make sure reservation tag is right spawn $scontrol show job $job_id expect { @@ -344,7 +344,7 @@ proc inc3_11_9 {} { exit 1 } - sleep 1 + sleep 10 # Show the job, make sure reservation tag is right spawn $scontrol show job $job_id expect { @@ -405,7 +405,7 @@ proc inc3_11_9 {} { exit 1 } - sleep 1 + sleep 10 # Show the job, make sure reservation tag is right spawn 
$scontrol show job $job_id expect { @@ -463,7 +463,7 @@ proc inc3_11_9 {} { exit 1 } - sleep 1 + sleep 10 # Show the job, make sure reservation tag is right spawn $scontrol show job $job_id expect { @@ -522,7 +522,7 @@ proc inc3_11_9 {} { exit 1 } - sleep 1 + sleep 10 # Show the job, make sure reservation tag is right spawn $scontrol show job $job_id expect { diff --git a/testsuite/expect/test2.8 b/testsuite/expect/test2.8 index 4d50c072345914cae46f72315fcebe59fba83cfa..3de02c4c31743ee6b98d825e77ff4fe43f36def4 100755 --- a/testsuite/expect/test2.8 +++ b/testsuite/expect/test2.8 @@ -103,6 +103,19 @@ if {$job_id2 == 0} { exec $bin_rm -f $file_in +if {[wait_for_job $job_id1 "RUNNING"] != 0} { + send_user "\nFAILURE: waiting for job $job_id1 to start\n" + cancel_job $job_id1 + cancel_job $job_id2 + exit 1 +} +if {[wait_for_job $job_id2 "RUNNING"] != 0} { + send_user "\nFAILURE: waiting for job $job_id2 to start\n" + cancel_job $job_id1 + cancel_job $job_id2 + exit 1 +} + # # Look for these jobs with scontrol #
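
For context on the pmi.c change above: after PMI_Barrier() the library holds every [key,val] pair from every rank, including the pairs this rank stored itself, so each locally stored key exists twice (once in state LOCAL, once in state GLOBAL). The following is a simplified standalone sketch, not the real pmi.c data structures (the struct layout, sizes, and names here are illustrative only), showing how marking the LOCAL copies DISABLED lets the iterator skip the duplicates.

/* Simplified model of the kvs_rec key-state handling in the pmi.c
 * change above; layout and helper names are hypothetical. */
#include <stdio.h>
#include <string.h>

#define KVS_KEY_STATE_GLOBAL   0
#define KVS_KEY_STATE_LOCAL    1
#define KVS_KEY_STATE_DISABLED 2

#define MAX_KEYS 8

struct kvs_rec {
	int  kvs_cnt;
	int  kvs_key_states[MAX_KEYS];
	char kvs_keys[MAX_KEYS][32];
};

/* After the barrier exchange, every local key also has a global copy;
 * disable the local ones so iteration reports each key exactly once. */
static void kvs_disable_local_keys(struct kvs_rec *rec)
{
	for (int j = 0; j < rec->kvs_cnt; j++)
		if (rec->kvs_key_states[j] == KVS_KEY_STATE_LOCAL)
			rec->kvs_key_states[j] = KVS_KEY_STATE_DISABLED;
}

static void kvs_iterate(const struct kvs_rec *rec)
{
	for (int j = 0; j < rec->kvs_cnt; j++) {
		if (rec->kvs_key_states[j] == KVS_KEY_STATE_DISABLED)
			continue;	/* skip disabled duplicates */
		printf("%s\n", rec->kvs_keys[j]);
	}
}

int main(void)
{
	struct kvs_rec rec = { .kvs_cnt = 3 };

	/* "rank0_key" was stored locally and also came back from the
	 * barrier exchange, so it appears twice before the fix. */
	strcpy(rec.kvs_keys[0], "rank0_key");
	rec.kvs_key_states[0] = KVS_KEY_STATE_LOCAL;
	strcpy(rec.kvs_keys[1], "rank0_key");
	rec.kvs_key_states[1] = KVS_KEY_STATE_GLOBAL;
	strcpy(rec.kvs_keys[2], "rank1_key");
	rec.kvs_key_states[2] = KVS_KEY_STATE_GLOBAL;

	kvs_disable_local_keys(&rec);
	kvs_iterate(&rec);	/* prints rank0_key and rank1_key once each */
	return 0;
}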
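The slurmctld changes above follow one pattern: RPC handlers and other event sources stop calling schedule() directly and instead call queue_job_scheduler(), which only bumps a counter under a mutex; the background thread drains the counter and runs a single scheduling pass per tick, plus a full-depth pass every PERIODIC_SCHEDULE seconds. The sketch below is a standalone model of that pattern, not slurmctld code: the stub schedule(), the one-second tick, and main() are assumptions added for illustration, while the counter, mutex, and depth-selection logic mirror the patch.

/* Standalone sketch of the deferred-scheduling pattern introduced
 * above. Build with: cc sketch.c -lpthread */
#include <pthread.h>
#include <stdio.h>
#include <time.h>
#include <unistd.h>

#define PERIODIC_SCHEDULE 30		/* force a full pass this often (sec) */
#define NO_VAL   0xfffffffeu
#define INFINITE 0xffffffffu

static pthread_mutex_t sched_cnt_mutex = PTHREAD_MUTEX_INITIALIZER;
static int job_sched_cnt = 0;

/* Cheap to call from any event handler; never takes scheduler locks. */
static void queue_job_scheduler(void)
{
	pthread_mutex_lock(&sched_cnt_mutex);
	job_sched_cnt++;
	pthread_mutex_unlock(&sched_cnt_mutex);
}

/* Stub standing in for the real scheduler. */
static int schedule(unsigned int job_limit)
{
	printf("scheduling pass, %s depth\n",
	       (job_limit == INFINITE) ? "full" : "default");
	return 0;
}

/* Background loop: coalesce any number of queued requests into one
 * pass per tick; run the full queue on the periodic interval. */
static void *background_loop(void *arg)
{
	time_t last_sched_time = time(NULL);

	(void) arg;
	for (;;) {
		unsigned int job_limit = NO_VAL;
		time_t now;

		sleep(1);
		now = time(NULL);
		pthread_mutex_lock(&sched_cnt_mutex);
		if (difftime(now, last_sched_time) >= PERIODIC_SCHEDULE) {
			job_limit = INFINITE;	/* periodic: full queue */
			job_sched_cnt = 0;
		} else if (job_sched_cnt) {
			job_limit = 0;		/* queued events: default depth */
			job_sched_cnt = 0;
		}
		pthread_mutex_unlock(&sched_cnt_mutex);

		if (job_limit != NO_VAL) {
			last_sched_time = now;
			(void) schedule(job_limit);
		}
	}
	return NULL;
}

int main(void)
{
	pthread_t tid;

	pthread_create(&tid, NULL, background_loop, NULL);

	/* A burst of 1000 "job submitted" events collapses into a
	 * single default-depth scheduling pass on the next tick. */
	for (int i = 0; i < 1000; i++)
		queue_job_scheduler();
	sleep(3);
	return 0;
}

This also explains the test-suite hunks above: because scheduling is now deferred to the background thread rather than run inline with each RPC, the expect tests wait longer (sleep 10, wait_for_job) before checking that a submitted job has started.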