From c575466317f09bc083587089d1044585f168c5a2 Mon Sep 17 00:00:00 2001 From: Danny Auble <da@llnl.gov> Date: Mon, 8 Jun 2009 19:24:52 +0000 Subject: [PATCH] svn merge -r17738:17749 https://eris.llnl.gov/svn/slurm/branches/slurm-2.0 --- NEWS | 6 +++++ doc/html/power_save.shtml | 4 ---- doc/man/man5/slurm.conf.5 | 4 ---- src/common/slurm_protocol_api.c | 24 ++++++++++++++----- .../mysql/mysql_jobacct_process.c | 18 +++++++------- src/slurmctld/power_save.c | 15 ++++++++---- 6 files changed, 44 insertions(+), 27 deletions(-) diff --git a/NEWS b/NEWS index 4e60fb0257a..d7b27ed8aba 100644 --- a/NEWS +++ b/NEWS @@ -46,6 +46,12 @@ documents those changes that are of interest to users and admins. -- Fix node weight (scheduling priority) calculation for powered down nodes. Patch from Hongjia Cao, NUDT. -- Fix node suspend/resume rate calculations. Patch from Hongjia Cao, NUDT. + -- Change calculations using ResumeRate and SuspendRate to provide higher + resolution. + -- Log the IP address for incoming messages having an invalid protocol + version number. + -- Fix for sacct to show jobs that start the same second as the sacct + command is issued. * Changes in SLURM 2.0.1 ======================== diff --git a/doc/html/power_save.shtml b/doc/html/power_save.shtml index 0254e701892..763584b42d0 100644 --- a/doc/html/power_save.shtml +++ b/doc/html/power_save.shtml @@ -43,8 +43,6 @@ Maximum number of nodes to be placed into power saving mode per minute. A value of zero results in no limits being imposed. The default value is 60. -Rate calculations are performed using integers, so rounding errors -may be significant for very small values (1 or 2 nodes per minute). Use this to prevent rapid drops in power requirements.</li> <li><b>ResumeRate</b>: @@ -52,8 +50,6 @@ Maximum number of nodes to be removed from power saving mode per minute. A value of zero results in no limits being imposed. The default value is 300. 
-Rate calculations are performed using integers, so rounding errors -may be significant for very small values (1 or 2 nodes per minute). Use this to prevent rapid increases in power requirements.</li> <li><b>SuspendProgram</b>: diff --git a/doc/man/man5/slurm.conf.5 b/doc/man/man5/slurm.conf.5 index e1d931226ef..d82014881cf 100644 --- a/doc/man/man5/slurm.conf.5 +++ b/doc/man/man5/slurm.conf.5 @@ -1020,8 +1020,6 @@ power surges if a large number of nodes in power save mode are assigned work at the same time (e.g. a large job starts). A value of zero results in no limits being imposed. The default value is 300 nodes per minute. -Rate calculations are performed using integers, so rounding errors -may be significant for very small values (1 or 2 nodes per minute). Related configuration options include \fBResumeTimeout\fR, \fBResumeProgram\fR, \fBSuspendRate\fR, \fBSuspendTime\fR, \fBResumeTimeout\fR, \fBSuspendProgram\fR, \fBSuspendExcNodes\fR, and \fBSuspendExcParts\fR. @@ -1443,8 +1441,6 @@ The value is number of nodes per minute and it can be used to prevent a large drop in power power consumption (e.g. after a large job completes). A value of zero results in no limits being imposed. The default value is 60 nodes per minute. -Rate calculations are performed using integers, so rounding errors -may be significant for very small values (1 or 2 nodes per minute). Related configuration options include \fBResumeTimeout\fR, \fBResumeProgram\fR, \fBResumeRate\fR, \fBSuspendProgram\fR, \fBSuspendTime\fR, \fBSuspendTimeout\fR, \fBSuspendExcNodes\fR, and \fBSuspendExcParts\fR. 
diff --git a/src/common/slurm_protocol_api.c b/src/common/slurm_protocol_api.c index fb8c0ee9387..1ab6214c1b8 100644 --- a/src/common/slurm_protocol_api.c +++ b/src/common/slurm_protocol_api.c @@ -1905,9 +1905,13 @@ int slurm_receive_msg(slurm_fd fd, slurm_msg_t *msg, int timeout) } if (check_header_version(&header) < 0) { + slurm_addr resp_addr; + char addr_str[32]; int uid = _unpack_msg_uid(buffer); - error("Invalid Protocol Version %u from uid=%d", - header.version, uid); + slurm_get_peer_addr(fd, &resp_addr); + slurm_print_slurm_addr(&resp_addr, addr_str, sizeof(addr_str)); + error("Invalid Protocol Version %u from uid=%d at %s", + header.version, uid, addr_str); free_buf(buffer); rc = SLURM_PROTOCOL_VERSION_ERROR; goto total_return; @@ -2067,9 +2071,13 @@ List slurm_receive_msgs(slurm_fd fd, int steps, int timeout) } if(check_header_version(&header) < 0) { + slurm_addr resp_addr; + char addr_str[32]; int uid = _unpack_msg_uid(buffer); - error("Invalid Protocol Version %u from uid=%d", - header.version, uid); + slurm_get_peer_addr(fd, &resp_addr); + slurm_print_slurm_addr(&resp_addr, addr_str, sizeof(addr_str)); + error("Invalid Protocol Version %u from uid=%d at %s", + header.version, uid, addr_str); free_buf(buffer); rc = SLURM_PROTOCOL_VERSION_ERROR; goto total_return; @@ -2250,9 +2258,13 @@ int slurm_receive_msg_and_forward(slurm_fd fd, slurm_addr *orig_addr, } if (check_header_version(&header) < 0) { + slurm_addr resp_addr; + char addr_str[32]; int uid = _unpack_msg_uid(buffer); - error("Invalid Protocol Version %u from uid=%d", - header.version, uid); + slurm_get_peer_addr(fd, &resp_addr); + slurm_print_slurm_addr(&resp_addr, addr_str, sizeof(addr_str)); + error("Invalid Protocol Version %u from uid=%d at %s", + header.version, uid, addr_str); free_buf(buffer); rc = SLURM_PROTOCOL_VERSION_ERROR; goto total_return; diff --git a/src/plugins/accounting_storage/mysql/mysql_jobacct_process.c b/src/plugins/accounting_storage/mysql/mysql_jobacct_process.c index 
663ef55da69..938c163ee22 100644 --- a/src/plugins/accounting_storage/mysql/mysql_jobacct_process.c +++ b/src/plugins/accounting_storage/mysql/mysql_jobacct_process.c @@ -470,7 +470,6 @@ extern int setup_job_cond_limits(mysql_conn_t *mysql_conn, char *object = NULL; char *table_level = "t2"; jobacct_selected_step_t *selected_step = NULL; - time_t now = time(NULL); if(!job_cond) return 0; @@ -651,17 +650,20 @@ extern int setup_job_cond_limits(mysql_conn_t *mysql_conn, } if(job_cond->usage_start) { - if(!job_cond->usage_end) - job_cond->usage_end = now; - if(*extra) xstrcat(*extra, " && ("); else xstrcat(*extra, " where ("); - xstrfmtcat(*extra, - "(t1.eligible < %d " - "&& (t1.end >= %d || t1.end = 0)))", - job_cond->usage_end, job_cond->usage_start); + + if(!job_cond->usage_end) + xstrfmtcat(*extra, + "t1.end >= %d || t1.end = 0)", + job_cond->usage_start); + else + xstrfmtcat(*extra, + "(t1.eligible < %d " + "&& (t1.end >= %d || t1.end = 0)))", + job_cond->usage_end, job_cond->usage_start); } else if(job_cond->usage_end) { if(*extra) xstrcat(*extra, " && ("); diff --git a/src/slurmctld/power_save.c b/src/slurmctld/power_save.c index d16a5719982..9e2ae879a81 100644 --- a/src/slurmctld/power_save.c +++ b/src/slurmctld/power_save.c @@ -83,7 +83,8 @@ time_t last_config = (time_t) 0, last_suspend = (time_t) 0; uint16_t slurmd_timeout; bitstr_t *exc_node_bitmap = NULL, *suspend_node_bitmap = NULL; -int suspend_cnt, resume_cnt; +int suspend_cnt, resume_cnt; +float suspend_cnt_f, resume_cnt_f; static void _clear_power_config(void); static void _do_power_work(void); @@ -111,13 +112,15 @@ static void _do_power_work(void) /* Set limit on counts of nodes to have state changed */ delta_t = now - last_work_scan; if (delta_t >= 60) { - suspend_cnt = 0; - resume_cnt = 0; + suspend_cnt_f = 0.0; + resume_cnt_f = 0.0; } else { float rate = (60 - delta_t) / 60.0; - suspend_cnt *= rate; - resume_cnt *= rate; + suspend_cnt_f *= rate; + resume_cnt_f *= rate; } + suspend_cnt = 
(suspend_cnt_f + 0.5); + resume_cnt = (resume_cnt_f + 0.5); if (now > (last_suspend + suspend_timeout)) { /* ready to start another round of node suspends */ @@ -151,6 +154,7 @@ static void _do_power_work(void) } wake_cnt++; resume_cnt++; + resume_cnt_f++; node_ptr->node_state &= (~NODE_STATE_POWER_SAVE); bit_clear(power_node_bitmap, i); node_ptr->node_state |= NODE_STATE_NO_RESPOND; @@ -173,6 +177,7 @@ static void _do_power_work(void) } sleep_cnt++; suspend_cnt++; + suspend_cnt_f++; node_ptr->node_state |= NODE_STATE_POWER_SAVE; bit_set(power_node_bitmap, i); bit_set(sleep_node_bitmap, i); -- GitLab