diff --git a/NEWS b/NEWS
index 3eca0aa52f7fbdfe90c472be5c71ecc889251c7a..3ba9fab3dca3f838e371eac3e1087f6a702cd2bf 100644
--- a/NEWS
+++ b/NEWS
@@ -23,6 +23,8 @@ documents those changes that are of interest to users and admins.
  -- Fix preempt/partition_prio to avoid preempting jobs in partitions with
     PreemptMode=OFF
  -- launch/poe - Implicitly set --network in job step create request as needed.
+ -- Permit multiple batch job submissions to be made for each run of the
+    scheduler logic if the job submissions occur at nearly the same time.
 
 * Changes in Slurm 2.6.7
 ========================
diff --git a/src/slurmctld/proc_req.c b/src/slurmctld/proc_req.c
index 7c941100e0aec4485995de3c101ebf0675ca5c35..7aa829a46bf56a67076d19d080a1babc13a1247f 100644
--- a/src/slurmctld/proc_req.c
+++ b/src/slurmctld/proc_req.c
@@ -2776,6 +2776,9 @@ static void _slurm_rpc_submit_batch_job(slurm_msg_t * msg)
 	static time_t config_update = 0;
 	static bool defer_sched = false;
 	static int active_rpc_cnt = 0;
+	static pthread_mutex_t sched_cnt_mutex = PTHREAD_MUTEX_INITIALIZER;
+	static int sched_cnt = 0;
+	int sched_now_cnt = 0;
 	int error_code = SLURM_SUCCESS;
 	DEF_TIMERS;
 	uint32_t step_id = 0;
@@ -2822,6 +2825,7 @@ static void _slurm_rpc_submit_batch_job(slurm_msg_t * msg)
 	if (error_code == SLURM_SUCCESS) {
 		_throttle_start(&active_rpc_cnt);
 		lock_slurmctld(job_write_lock);
+		START_TIMER;	/* Restart after we have locks */
 		if (job_desc_msg->job_id != SLURM_BATCH_SCRIPT) {
 			job_ptr = find_job_record(job_desc_msg->job_id);
 			if (job_ptr && IS_JOB_FINISHED(job_ptr)) {
@@ -2833,7 +2837,7 @@ static void _slurm_rpc_submit_batch_job(slurm_msg_t * msg)
 					ESLURM_DUPLICATE_JOB_ID);
 				unlock_slurmctld(job_write_lock);
 				_throttle_fini(&active_rpc_cnt);
-				return;
+				goto fini;
 			}
 			job_ptr = NULL;	/* OK to re-use job id */
 		}
@@ -2852,7 +2856,7 @@ static void _slurm_rpc_submit_batch_job(slurm_msg_t * msg)
 			slurm_send_rc_msg(msg, ESLURM_NO_STEPS);
 			unlock_slurmctld(job_write_lock);
 			_throttle_fini(&active_rpc_cnt);
-			return;
+			goto fini;
 		}
 #endif
 
@@ -2865,14 +2869,14 @@ static void _slurm_rpc_submit_batch_job(slurm_msg_t * msg)
 			slurm_send_rc_msg(msg, ESLURM_USER_ID_MISSING);
 			unlock_slurmctld(job_write_lock);
 			_throttle_fini(&active_rpc_cnt);
-			return;
+			goto fini;
 		}
 
 		if (job_ptr->details && job_ptr->details->prolog_running) {
 			slurm_send_rc_msg(msg, EAGAIN);
 			unlock_slurmctld(job_write_lock);
 			_throttle_fini(&active_rpc_cnt);
-			return;
+			goto fini;
 		}
 
 		error_code = _launch_batch_step(job_desc_msg, uid,
@@ -2900,7 +2904,7 @@ static void _slurm_rpc_submit_batch_job(slurm_msg_t * msg)
 					       &response_msg);
 			schedule_job_save();
 		}
-		return;
+		goto fini;
 	}
 
 	/* Create new job allocation */
@@ -2936,20 +2940,32 @@ static void _slurm_rpc_submit_batch_job(slurm_msg_t * msg)
 		response_msg.msg_type = RESPONSE_SUBMIT_BATCH_JOB;
 		response_msg.data = &submit_msg;
 		slurm_send_node_msg(msg->conn_fd, &response_msg);
-		/* We need to use schedule() to initiate a batch job in order
-		 * to run the various prologs, boot the node, etc.
-		 * We also run schedule() even if this job could not start,
-		 * say due to a higher priority job, since the locks are
-		 * released above and we might start some other job here.
-		 *
-		 * In defer mode, avoid triggering the scheduler logic
-		 * for every submit batch job request.
-		 */
-		if (!defer_sched)
-			(void) schedule(schedule_cnt);	/* has own locks */
+
+		/* In defer mode, avoid triggering the scheduler logic
+		 * for every submit batch job request. */
+		if (!defer_sched) {
+			slurm_mutex_lock(&sched_cnt_mutex);
+			sched_cnt += schedule_cnt;
+			slurm_mutex_unlock(&sched_cnt_mutex);
+		}
 		schedule_job_save();	/* has own locks */
 		schedule_node_save();	/* has own locks */
 	}
+
+fini:	/* We need to use schedule() to initiate a batch job in order to run
+	 * the various prologs, boot the node, etc. We also run schedule()
+	 * even if this job could not start, say due to a higher priority job,
+	 * since the locks are released above and we might start some other
+	 * job here. We do not run schedule() on each batch submission to
+	 * limit its overhead on large numbers of job submissions. */
+	slurm_mutex_lock(&sched_cnt_mutex);
+	if ((active_rpc_cnt == 0) || (sched_cnt > 32)) {
+		sched_now_cnt = sched_cnt;
+		sched_cnt = 0;
+	}
+	slurm_mutex_unlock(&sched_cnt_mutex);
+	if (sched_now_cnt)
+		(void) schedule(sched_now_cnt);	/* has own locks */
 }
 
 /* _slurm_rpc_update_job - process RPC to update the configuration of a
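
For reference, the batching pattern introduced by this patch can be exercised outside of slurmctld. The following is a minimal standalone sketch, not Slurm code: it substitutes plain pthread primitives for Slurm's slurm_mutex_* wrappers and the _throttle_start()/_throttle_fini() helpers, and the names submit_handler(), run_scheduler(), and SCHED_BATCH_LIMIT are invented for illustration. Each simulated submit RPC adds to a mutex-protected counter; a scheduling pass runs only when the handler is the last active RPC or the backlog exceeds the threshold, so near-simultaneous submissions share one pass.

/* Hypothetical demo of the deferred-scheduling pattern; not part of Slurm.
 * Build with: cc -pthread demo.c -o demo */
#include <pthread.h>
#include <stdio.h>

#define SCHED_BATCH_LIMIT 32		/* same threshold used in the patch */

static pthread_mutex_t rpc_mutex = PTHREAD_MUTEX_INITIALIZER;
static pthread_mutex_t sched_cnt_mutex = PTHREAD_MUTEX_INITIALIZER;
static int active_rpc_cnt = 0;		/* submit RPCs currently in flight */
static int sched_cnt = 0;		/* pending scheduler requests */

static void run_scheduler(int job_limit)
{
	/* Stand-in for schedule(); just reports how much work was batched */
	printf("scheduler pass covering %d submission(s)\n", job_limit);
}

static void *submit_handler(void *arg)
{
	int remaining, sched_now_cnt = 0;

	(void) arg;

	pthread_mutex_lock(&rpc_mutex);
	active_rpc_cnt++;		/* analogous to _throttle_start() */
	pthread_mutex_unlock(&rpc_mutex);

	/* ... job record creation and the client response go here ... */

	/* Record that this submission wants a scheduling pass */
	pthread_mutex_lock(&sched_cnt_mutex);
	sched_cnt++;
	pthread_mutex_unlock(&sched_cnt_mutex);

	pthread_mutex_lock(&rpc_mutex);
	remaining = --active_rpc_cnt;	/* analogous to _throttle_fini() */
	pthread_mutex_unlock(&rpc_mutex);

	/* Run the scheduler only if this was the last active RPC or the
	 * backlog is large; concurrent submissions share one pass. */
	pthread_mutex_lock(&sched_cnt_mutex);
	if ((remaining == 0) || (sched_cnt > SCHED_BATCH_LIMIT)) {
		sched_now_cnt = sched_cnt;
		sched_cnt = 0;
	}
	pthread_mutex_unlock(&sched_cnt_mutex);
	if (sched_now_cnt)
		run_scheduler(sched_now_cnt);
	return NULL;
}

int main(void)
{
	pthread_t tid[8];
	int i;

	for (i = 0; i < 8; i++)
		pthread_create(&tid[i], NULL, submit_handler, NULL);
	for (i = 0; i < 8; i++)
		pthread_join(tid[i], NULL);
	return 0;
}

With eight concurrent handlers, most threads only increment the counter; typically just the last one to finish (or one that pushes the backlog past 32) triggers a pass covering all accumulated submissions, mirroring the intent of the fini: block in the patch.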