diff --git a/NEWS b/NEWS index f5fca0f98ca817c8b02beddda5404338a05dadeb..da86a648e71bbcbc6c1adc5b4853705abdb05399 100644 --- a/NEWS +++ b/NEWS @@ -60,6 +60,9 @@ documents those changes that are of interest to users and administrators. older directory structure formats. -- Prevent triggering gang scheduling within a partition if configured with PreemptType=partition_prio and PreemptMode=suspend,gang. + -- Decrease parallelism in job cancel request to prevent denial of service + when cancelling huge numbers of jobs. + -- If all ephemeral ports are in use, try using other port numbers. * Changes in Slurm 15.08.5 ========================== diff --git a/src/common/slurm_protocol_api.c b/src/common/slurm_protocol_api.c index 54de11da675a98f17cbf3bb3782a39dde8f69368..a0171c96b194467ed32afb9a65658e9021815366 100644 --- a/src/common/slurm_protocol_api.c +++ b/src/common/slurm_protocol_api.c @@ -2747,17 +2747,17 @@ slurm_fd_t slurm_init_msg_engine_port(uint16_t port) { slurm_fd_t cc; slurm_addr_t addr; - int cnt; + int i; - cnt = 0; -eagain: slurm_setup_sockaddr(&addr, port); cc = slurm_init_msg_engine(&addr); - if (cc < 0 && port == 0) { - ++cnt; - if (cnt <= 5) { - usleep(5000); - goto eagain; + if ((cc < 0) && (port == 0) && (errno == EADDRINUSE)) { + /* All ephemeral ports are in use, test other ports */ + for (i = 10001; i < 65536; i++) { + slurm_setup_sockaddr(&addr, i); + cc = slurm_init_msg_engine(&addr); + if (cc >= 0) + break; } } return cc; diff --git a/src/scancel/scancel.c b/src/scancel/scancel.c index 40f56bf56c6b1725816d9d5e8f5ee42cd5942e4c..a9c094528b425224f610f6b0742f1e70ee28fc77 100644 --- a/src/scancel/scancel.c +++ b/src/scancel/scancel.c @@ -70,7 +70,7 @@ #include "src/scancel/scancel.h" #define MAX_CANCEL_RETRY 10 -#define MAX_THREADS 20 +#define MAX_THREADS 2 static int _cancel_jobs (int filter_cnt); diff --git a/src/slurmctld/proc_req.c b/src/slurmctld/proc_req.c index 335b9601b95a85e655b4d0773bf458521f5e6f32..e1e306731a1626895ff72ed03c0bd48b63f5b95a 100644 --- a/src/slurmctld/proc_req.c +++ b/src/slurmctld/proc_req.c @@ -5442,6 +5442,7 @@ extern void free_rpc_stats(void) inline static void _slurm_rpc_kill_job2(slurm_msg_t *msg) { + static int active_rpc_cnt = 0; DEF_TIMERS; job_step_kill_msg_t *kill; slurmctld_lock_t lock = {READ_LOCK, WRITE_LOCK, @@ -5456,13 +5457,16 @@ _slurm_rpc_kill_job2(slurm_msg_t *msg) info("%s: REQUEST_KILL_JOB job %s uid %d", __func__, kill->sjob_id, uid); + _throttle_start(&active_rpc_cnt); lock_slurmctld(lock); - cc = job_str_signal(kill->sjob_id, kill->signal, kill->flags, uid, 0); + unlock_slurmctld(lock); + _throttle_fini(&active_rpc_cnt); + if (cc == ESLURM_ALREADY_DONE) { debug2("%s: job_str_signal() job %s sig %d returned %s", __func__, kill->sjob_id, @@ -5477,7 +5481,6 @@ _slurm_rpc_kill_job2(slurm_msg_t *msg) slurm_send_rc_msg(msg, cc); - unlock_slurmctld(lock); END_TIMER2("_slurm_rpc_kill_job2"); }